Compare commits

...

7 Commits

Author SHA1 Message Date
Panuluh
118c56c12d Merge pull request #13 from nuluh/feature/10-add-labels-column-to-time-domain-feature-extraction-dataframe
feat(notebook): add 'labels' column to feature extraction dataframe
2024-08-26 09:55:46 +07:00
nuluh
79a0f82372 feat(notebook): add 'labels' column to feature extraction dataframe
Implement extraction of 'labels' from directory names and append as a new column in the dataframe during feature extraction. Adapted from the existing `build_features.py` script to enhance data usability in supervised learning models within the Jupyter notebook environment.

Closes #10
2024-08-20 15:28:19 +07:00
Panuluh
c9415c21fa Merge pull request #9 from nuluh/feature/automate-csv-file
Closes #4
2024-08-20 13:01:42 +07:00
nuluh
de902b2a8c feat: Add launch.json for Python debugger configuration
This commit adds a new file, `.vscode/launch.json`, which contains the configuration for launching the Python debugger. The configuration includes the necessary attributes such as the debugger type, request type, program file, console type, and command-line arguments. This configuration allows developers to easily debug Python files in the integrated terminal.
2024-08-20 12:52:48 +07:00
nuluh
57c0e03a4f docs(script): Update time-domain feature extraction to skip header row separator char info 2024-08-20 12:52:48 +07:00
nuluh
8ab934fe1c feat(features): refactor feature extraction to handle multiple files and directories
- Modify `build_features` function to support iterative processing across nested directories, enhancing the system's ability to handle larger datasets and varied input structures.
- Replace direct usage of `FeatureExtractor` class with `ExtractTimeFeatures` function, which now acts as a wrapper to include this class, facilitating streamlined integration and maintenance of feature extraction processes.
- Implement `extract_numbers` function using regex to parse filenames and extract numeric identifiers, used for labels when training with SVM
- Switch output from `.npz` to `.csv` format in `build_features`, offering better compatibility with data analysis tools and readability.
- Update documentation and comments within the code to reflect changes in functionality and usage of the new feature extraction setup.

Closes #4
2024-08-20 12:52:06 +07:00
nuluh
55db5709a9 refactor(script): Add the ExtractTimeFeatures function for time-domain feature extraction. It returns the features as a dictionary and is called from build_features.py once for each individual .csv file; each returned value is then appended to a list in build_features.py.
Wrapping the class in a function, rather than using the class directly, keeps the code flexible and easier to maintain.
2024-08-19 13:20:14 +07:00
4 changed files with 1265 additions and 14 deletions

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": ["data/raw", "data/raw"]
}
]
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,16 +1,39 @@
# src/features/build_features.py
import pandas as pd
from time_domain_features import FeatureExtractor
import numpy as np
from time_domain_features import ExtractTimeFeatures
import os
import re
def build_features(input_file, output_file):
data = pd.read_csv(input_file)
# Assuming the relevant data is in the first column
extractor = FeatureExtractor(data.iloc[:, 0].values)
features = extractor.features
# Pull the numeric identifiers (e.g. damage level and test number) out of a filename.
def extract_numbers(filename):
    """Return every run of digits in *filename* as a tuple of ints.

    Args:
        filename: Name (or path) of the file to parse.

    Returns:
        A tuple with one int per group of consecutive digits, in the order
        they appear in *filename*. The tuple is empty when the name holds
        no digits.
    """
    # The previous version returned ``print(...)``, i.e. always ``None``;
    # the tuple itself is what callers need.
    return tuple(int(num) for num in re.findall(r'\d+', filename))
def build_features(input_dir, output_dir):
    """Extract features from every file under *input_dir* and save one CSV.

    *input_dir* is expected to contain one sub-directory per damage case,
    each holding the individual test files. Every file is passed to
    ``ExtractTimeFeatures`` and the resulting feature dictionaries are
    combined, one row per file, into ``combined_features.csv`` inside
    *output_dir*.

    Args:
        input_dir: Directory whose sub-directories hold the raw test files.
        output_dir: Directory that receives ``combined_features.csv``; it is
            created when it does not exist yet.
    """
    all_features = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the output file reproducible between runs and machines.
    for nth_damage in sorted(os.listdir(input_dir)):
        nth_damage_path = os.path.join(input_dir, nth_damage)
        if not os.path.isdir(nth_damage_path):
            continue  # stray files next to the damage directories are ignored
        print(nth_damage)
        for nth_test in sorted(os.listdir(nth_damage_path)):
            nth_test_path = os.path.join(nth_damage_path, nth_test)
            if not os.path.isfile(nth_test_path):
                continue  # a nested directory cannot be read as a CSV file
            # One dictionary of features per input file.
            features = ExtractTimeFeatures(nth_test_path)
            all_features.append(features)

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(all_features)
    print(df)

    # Save the DataFrame to a CSV file in the output directory
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_file_path = os.path.join(output_dir, 'combined_features.csv')
    df.to_csv(output_file_path, index=False)
    print(f"Features saved to {output_file_path}")
# Save features to a file
np.savez(output_file, **features)
# np.savez(output_file, **features)
if __name__ == "__main__":
import sys
@@ -18,4 +41,4 @@ if __name__ == "__main__":
output_path = sys.argv[2] # 'data/features/feature_matrix.npz'
# Assuming only one file for simplicity; adapt as needed
build_features(f"{input_path}processed_data.csv", output_path)
build_features(input_path, output_path)

View File

@@ -36,6 +36,13 @@ class FeatureExtractor:
result += f"{feature}: {value:.4f}\n"
return result
def ExtractTimeFeatures(object):
    """Compute the time-domain features of a single CSV file.

    The file is read with its first line skipped, and the values of the
    second column are handed to ``FeatureExtractor``.

    Args:
        object: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        The ``features`` attribute of the ``FeatureExtractor`` built from
        the column values.
    """
    # skiprows=1 drops the leading separator-info line that precedes the header.
    frame = pd.read_csv(object, skiprows=1)
    signal = frame.iloc[:, 1].values  # the data is assumed to sit in the second column
    return FeatureExtractor(signal).features
# Save features to a file
# np.savez(output_file, **features)
# Usage
# Assume you have a CSV file with numerical data in the first column
# Create an instance of the class and pass the path to your CSV file