feat(features): integrate time-domain feature extraction into data pipeline

- Implement FeatureExtractor class in time_domain_features.py for calculating statistical features from dataset columns.
- Create build_features.py script to automate feature extraction from processed data and save results in a structured format.
- Adjust build_features.py to read processed data, utilize FeatureExtractor, and save the feature matrix.

This update supports enhanced analysis capabilities within the thesis-project structure, allowing for more sophisticated data processing and model training stages.

Closes #1
2024-08-12 19:45:19 +07:00
parent 7d39176e27
commit a401d620eb
2 changed files with 30 additions and 6 deletions
--- a/code/src/features/build_features.py
+++ b/code/src/features/build_features.py
@@ -0,0 +1,21 @@
+# src/features/build_features.py
+import pandas as pd
+from time_domain_features import FeatureExtractor
+import numpy as np
+
+def build_features(input_file, output_file):
+    data = pd.read_csv(input_file)
+    # Assuming the relevant data is in the first column
+    extractor = FeatureExtractor(data.iloc[:, 0].values)
+    features = extractor.features
+
+    # Save features to a file
+    np.savez(output_file, **features)
+
+if __name__ == "__main__":
+    import sys
+    input_path = sys.argv[1]  # 'data/processed/'
+    output_path = sys.argv[2]  # 'data/features/feature_matrix.npz'
+    
+    # Assumes a single input file named processed_data.csv; input_path must end with a path separator
+    build_features(f"{input_path}processed_data.csv", output_path)
--- a/code/src/features/time_domain_features.py
+++ b/code/src/features/time_domain_features.py
@@ -3,14 +3,16 @@ import pandas as pd
 from scipy.stats import kurtosis, skew

 class FeatureExtractor:
-    def __init__(self, file_path):
-        # Read data from CSV file
-        self.data = pd.read_csv(file_path)
-        # Assuming the data to analyze is in the first column
-        self.x = self.data.iloc[:, 0].values
+    # Accepts in-memory data instead of a file path so it plugs into the existing data processing pipeline
+    def __init__(self, data):
+        # Assuming data is a numpy array
+        self.x = data
+        # Calculate features
+        self.features = self.calculate_features()

        # Calculate all features
-        self.features = {
+    def calculate_features(self):
+        features = {
            'Mean': np.mean(self.x),
            'Max': np.max(self.x),
            'Peak (Pm)': np.max(np.abs(self.x)),
@@ -26,6 +28,7 @@ class FeatureExtractor:
            'Kurtosis': kurtosis(self.x, fisher=False),
            'Skewness': skew(self.x, bias=False)
        }
+        return features

    def __repr__(self):
        result = "Feature Extraction Results:\n"