From 8ab934fe1c08ec274a80329bf170044ca4905846 Mon Sep 17 00:00:00 2001
From: nuluh <dam.ar@outlook.com>
Date: Tue, 20 Aug 2024 11:27:02 +0700
Subject: [PATCH] feat(features): refactor feature extraction to handle
 multiple files and directories

- Modify `build_features` function to support iterative processing across nested directories, enhancing the system's ability to handle larger datasets and varied input structures.
- Replace direct usage of the `FeatureExtractor` class with the `ExtractTimeFeatures` function, which wraps that class, streamlining integration and maintenance of the feature extraction process.
- Implement the `extract_numbers` function, which uses a regex to parse filenames and extract their numeric identifiers; these are intended as labels when training with SVM.
- Switch output from `.npz` to `.csv` format in `build_features`, offering better compatibility with data analysis tools and readability.
- Update documentation and comments within the code to reflect changes in functionality and usage of the new feature extraction setup.

Closes #4
---
 code/src/features/build_features.py | 41 ++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/code/src/features/build_features.py b/code/src/features/build_features.py
index 017f26c..8ecd03c 100644
--- a/code/src/features/build_features.py
+++ b/code/src/features/build_features.py
@@ -1,16 +1,39 @@
 # src/features/build_features.py
 import pandas as pd
-from time_domain_features import FeatureExtractor
-import numpy as np
+from time_domain_features import ExtractTimeFeatures
+import os
+import re
 
-def build_features(input_file, output_file):
-    data = pd.read_csv(input_file)
-    # Assuming the relevant data is in the first column
-    extractor = FeatureExtractor(data.iloc[:, 0].values)
-    features = extractor.features
+# Regex helper: pull the numbers embedded in a filename (damage level and test number) out as a tuple.
+def extract_numbers(filename):
+    """Return every run of digits in `filename` as a tuple of ints."""
+    # The earlier `return print(...)` always returned None, because print()
+    # returns None; return the tuple itself so callers can actually use it.
+    numbers = re.findall(r'\d+', filename)
+    # Digit runs are converted in order of appearance; no digits gives ().
+    return tuple(int(num) for num in numbers)
 
+def build_features(input_dir, output_dir):
+    """Extract features for every entry one level below `input_dir`.
+
+    `ExtractTimeFeatures` runs on each entry of each sub-directory; the
+    results are the rows of `combined_features.csv` in `output_dir`."""
+    all_features = []
+    # os.listdir() order is arbitrary; sort so the row order is repeatable.
+    for nth_damage in sorted(os.listdir(input_dir)):
+        nth_damage_path = os.path.join(input_dir, nth_damage)
+        if os.path.isdir(nth_damage_path):
+            print(nth_damage)
+            for nth_test in sorted(os.listdir(nth_damage_path)):
+                nth_test_path = os.path.join(nth_damage_path, nth_test)
+                all_features.append(ExtractTimeFeatures(nth_test_path))
+    df = pd.DataFrame(all_features)
+    print(df)
     # Save features to a file
-    np.savez(output_file, **features)
+    os.makedirs(output_dir or ".", exist_ok=True)  # to_csv() will not create it
+    output_file_path = os.path.join(output_dir, 'combined_features.csv')
+    df.to_csv(output_file_path, index=False)
+    print(f"Features saved to {output_file_path}")
 
 if __name__ == "__main__":
     import sys
@@ -18,4 +41,4 @@ if __name__ == "__main__":
     output_path = sys.argv[2]  # 'data/features/feature_matrix.npz'
     
     # Assuming only one file for simplicity; adapt as needed
-    build_features(f"{input_path}processed_data.csv", output_path)
+    build_features(input_path, output_path)