feat(features): refactor feature extraction to handle multiple files and directories
- Modify `build_features` to iterate over nested directories, so the system can handle larger datasets and varied input structures.
- Replace direct use of the `FeatureExtractor` class with the `ExtractTimeFeatures` function, which now wraps that class to simplify integration and maintenance of the feature extraction process.
- Implement an `extract_numbers` function that uses a regex to parse filenames and extract numeric identifiers, which are used as labels when training with SVM.
- Switch the output of `build_features` from `.npz` to `.csv` format, for better compatibility with data analysis tools and improved readability.
- Update documentation and comments in the code to reflect the changed functionality and usage of the new feature extraction setup.

Closes #4
This commit is contained in:
@@ -1,16 +1,39 @@
|
|||||||
# src/features/build_features.py
|
# src/features/build_features.py
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from time_domain_features import FeatureExtractor
|
from time_domain_features import ExtractTimeFeatures
|
||||||
import numpy as np
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
def build_features(input_file, output_file):
|
# Define a helper that uses a regex to extract the damage level and test number from a filename as a pair of numbers
|
||||||
data = pd.read_csv(input_file)
|
def extract_numbers(filename):
|
||||||
# Assuming the relevant data is in the first column
|
# Find all occurrences of one or more digits in the filename
|
||||||
extractor = FeatureExtractor(data.iloc[:, 0].values)
|
numbers = re.findall(r'\d+', filename)
|
||||||
features = extractor.features
|
# Convert the list of number strings to integers
|
||||||
|
numbers = [int(num) for num in numbers]
|
||||||
|
# Convert to a tuple and return
|
||||||
|
return print(tuple(numbers))
|
||||||
|
|
||||||
|
def build_features(input_dir, output_dir):
|
||||||
|
all_features = []
|
||||||
|
for nth_damage in os.listdir(input_dir):
|
||||||
|
nth_damage_path = os.path.join(input_dir, nth_damage)
|
||||||
|
if os.path.isdir(nth_damage_path):
|
||||||
|
print(nth_damage)
|
||||||
|
for nth_test in os.listdir(nth_damage_path):
|
||||||
|
nth_test_path = os.path.join(nth_damage_path, nth_test)
|
||||||
|
# print(nth_test_path)
|
||||||
|
features = ExtractTimeFeatures(nth_test_path) # returns the features of one CSV file as a dictionary
|
||||||
|
all_features.append(features)
|
||||||
|
|
||||||
|
# Create a DataFrame from the list of dictionaries
|
||||||
|
df = pd.DataFrame(all_features)
|
||||||
|
print(df)
|
||||||
|
# Save the DataFrame to a CSV file in the output directory
|
||||||
|
output_file_path = os.path.join(output_dir, 'combined_features.csv')
|
||||||
|
df.to_csv(output_file_path, index=False)
|
||||||
|
print(f"Features saved to {output_file_path}")
|
||||||
# Save features to a file
|
# Save features to a file
|
||||||
np.savez(output_file, **features)
|
# np.savez(output_file, **features)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
import sys
|
||||||
@@ -18,4 +41,4 @@ if __name__ == "__main__":
|
|||||||
output_path = sys.argv[2] # 'data/features/feature_matrix.npz'
|
output_path = sys.argv[2] # 'data/features/feature_matrix.npz'
|
||||||
|
|
||||||
# Assuming only one file for simplicity; adapt as needed
|
# Assuming only one file for simplicity; adapt as needed
|
||||||
build_features(f"{input_path}processed_data.csv", output_path)
|
build_features(input_path, output_path)
|
||||||
|
|||||||
Reference in New Issue
Block a user