From 8ab934fe1c08ec274a80329bf170044ca4905846 Mon Sep 17 00:00:00 2001 From: nuluh Date: Tue, 20 Aug 2024 11:27:02 +0700 Subject: [PATCH] feat(features): refactor feature extraction to handle multiple files and directories - Modify `build_features` function to support iterative processing across nested directories, enhancing the system's ability to handle larger datasets and varied input structures. - Replace direct usage of `FeatureExtractor` class with `ExtractTimeFeatures` function, which now acts as a wrapper around this class, facilitating streamlined integration and maintenance of feature extraction processes. - Implement `extract_numbers` function using regex to parse filenames and extract numeric identifiers, which are used as labels when training with SVM. - Switch output from `.npz` to `.csv` format in `build_features`, offering better compatibility with data analysis tools and readability. - Update documentation and comments within the code to reflect changes in functionality and usage of the new feature extraction setup. 
Closes #4 --- code/src/features/build_features.py | 41 ++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/code/src/features/build_features.py b/code/src/features/build_features.py index 017f26c..8ecd03c 100644 --- a/code/src/features/build_features.py +++ b/code/src/features/build_features.py @@ -1,16 +1,39 @@ # src/features/build_features.py import pandas as pd -from time_domain_features import FeatureExtractor -import numpy as np +from time_domain_features import ExtractTimeFeatures +import os +import re -def build_features(input_file, output_file): - data = pd.read_csv(input_file) - # Assuming the relevant data is in the first column - extractor = FeatureExtractor(data.iloc[:, 0].values) - features = extractor.features +# define function, regex pattern for extracting the damage level and test number store in pairs array +def extract_numbers(filename): + # Find all occurrences of one or more digits in the filename + numbers = re.findall(r'\d+', filename) + # Convert the list of number strings to integers + numbers = [int(num) for num in numbers] + # Convert to a tuple and return + return print(tuple(numbers)) +def build_features(input_dir, output_dir): + all_features = [] + for nth_damage in os.listdir(input_dir): + nth_damage_path = os.path.join(input_dir, nth_damage) + if os.path.isdir(nth_damage_path): + print(nth_damage) + for nth_test in os.listdir(nth_damage_path): + nth_test_path = os.path.join(nth_damage_path, nth_test) + # print(nth_test_path) + features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {} + all_features.append(features) + + # Create a DataFrame from the list of dictionaries + df = pd.DataFrame(all_features) + print(df) + # Save the DataFrame to a CSV file in the output directory + output_file_path = os.path.join(output_dir, 'combined_features.csv') + df.to_csv(output_file_path, index=False) + print(f"Features saved to {output_file_path}") # Save features to a 
file - np.savez(output_file, **features) + # np.savez(output_file, **features) if __name__ == "__main__": import sys @@ -18,4 +41,4 @@ if __name__ == "__main__": output_path = sys.argv[2] # 'data/features/feature_matrix.npz' # Assuming only one file for simplicity; adapt as needed - build_features(f"{input_path}processed_data.csv", output_path) + build_features(input_path, output_path)