feat(features): integrate time-domain feature extraction into data pipeline
- Implement FeatureExtractor class in time_domain_features.py for calculating statistical features from dataset columns. - Create build_features.py script to automate feature extraction from processed data and save results in a structured format. - Adjust build_features.py to read processed data, utilize FeatureExtractor, and save feature matrix. This update supports enhanced analysis capabilities within the thesis-project structure, allowing for more sophisticated data processing and model training stages. Closes #1
This commit is contained in:
21
code/src/features/build_features.py
Normal file
21
code/src/features/build_features.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# src/features/build_features.py
|
||||||
|
import pandas as pd
|
||||||
|
from time_domain_features import FeatureExtractor
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
def build_features(input_file, output_file):
    """Extract time-domain features from a CSV file and store them on disk.

    Args:
        input_file: Path of the CSV file to read with ``pd.read_csv``.
        output_file: Destination handed to ``np.savez``; each feature is
            stored as its own named entry in the archive.
    """
    frame = pd.read_csv(input_file)

    # Only the first column is analysed (assumed to hold the relevant data).
    first_column = frame.iloc[:, 0].values
    feature_map = FeatureExtractor(first_column).features

    # Each key of the feature mapping becomes a named array in the archive.
    np.savez(output_file, **feature_map)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point:
    #   python build_features.py <processed_data_dir> <output_npz_path>
    import os
    import sys

    # Fail with a usage message instead of a bare IndexError when arguments
    # are missing. Extra arguments are still ignored, as before.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <processed_data_dir> <output_npz_path>")

    input_path = sys.argv[1]  # e.g. 'data/processed/'
    output_path = sys.argv[2]  # e.g. 'data/features/feature_matrix.npz'

    # Assuming only one file for simplicity; adapt as needed.
    # os.path.join gives the same path with or without a trailing separator
    # on the directory argument; plain string concatenation produced
    # 'data/processedprocessed_data.csv' when the slash was omitted.
    build_features(os.path.join(input_path, "processed_data.csv"), output_path)
|
||||||
@@ -3,14 +3,16 @@ import pandas as pd
|
|||||||
from scipy.stats import kurtosis, skew
|
from scipy.stats import kurtosis, skew
|
||||||
|
|
||||||
class FeatureExtractor:
|
class FeatureExtractor:
|
||||||
def __init__(self, file_path):
|
# integrates the feature extraction into your project's existing data processing pipeline
|
||||||
# Read data from CSV file
|
def __init__(self, data):
|
||||||
self.data = pd.read_csv(file_path)
|
# Assuming data is a numpy array
|
||||||
# Assuming the data to analyze is in the first column
|
self.x = data
|
||||||
self.x = self.data.iloc[:, 0].values
|
# Calculate features
|
||||||
|
self.features = self.calculate_features()
|
||||||
|
|
||||||
# Calculate all features
|
# Calculate all features
|
||||||
self.features = {
|
def calculate_features(self):
|
||||||
|
features = {
|
||||||
'Mean': np.mean(self.x),
|
'Mean': np.mean(self.x),
|
||||||
'Max': np.max(self.x),
|
'Max': np.max(self.x),
|
||||||
'Peak (Pm)': np.max(np.abs(self.x)),
|
'Peak (Pm)': np.max(np.abs(self.x)),
|
||||||
@@ -26,6 +28,7 @@ class FeatureExtractor:
|
|||||||
'Kurtosis': kurtosis(self.x, fisher=False),
|
'Kurtosis': kurtosis(self.x, fisher=False),
|
||||||
'Skewness': skew(self.x, bias=False)
|
'Skewness': skew(self.x, bias=False)
|
||||||
}
|
}
|
||||||
|
return features
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
result = "Feature Extraction Results:\n"
|
result = "Feature Extraction Results:\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user