feat: Add absolute value option to time feature extraction

feat(notebooks): Implement Time-domain feature extraction with real data from QUGS
fix(script): Fix bug that took the wrong column by indexing columns and sensor_end_map with the enumeration loop counter.
2024-09-03 15:39:44 +07:00 · 2024-09-03 12:52:40 +07:00 · 2024-09-03 12:08:53 +07:00 · 2024-09-03 11:50:44 +07:00 · 2024-09-03 11:43:46 +07:00 · 2024-09-03 11:38:49 +07:00
7 changed files with 2171 additions and 32 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 # Ignore CSV files in the data directory and all its subdirectories
 data/**/*.csv
-
+.venv/
 *.pyc
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+  // Use IntelliSense to learn about possible attributes.
+  // Hover to view descriptions of existing attributes.
+  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "Python Debugger: Current File with Arguments",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "${file}",
+      "console": "integratedTerminal",
+      "args": ["data/raw", "data/raw"]
+    }
+  ]
+}
--- a/code/notebooks/03_feature_extraction.ipynb
+++ b/code/notebooks/03_feature_extraction.ipynb
--- a/code/src/features/build_features.py
+++ b/code/src/features/build_features.py
@@ -1,16 +1,39 @@
 # src/features/build_features.py
 import pandas as pd
-from time_domain_features import FeatureExtractor
-import numpy as np
+from time_domain_features import ExtractTimeFeatures
+import os
+import re

-def build_features(input_file, output_file):
-    data = pd.read_csv(input_file)
-    # Assuming the relevant data is in the first column
-    extractor = FeatureExtractor(data.iloc[:, 0].values)
-    features = extractor.features
+# define function, regex pattern for extracting the damage level and test number store in pairs array
+def extract_numbers(filename):
+    # Find all occurrences of one or more digits in the filename
+    numbers = re.findall(r'\d+', filename)
+    # Convert the list of number strings to integers
+    numbers = [int(num) for num in numbers]
+    # Convert to a tuple and return
+    return print(tuple(numbers))

+def build_features(input_dir, output_dir):
+    all_features = []
+    for nth_damage in os.listdir(input_dir):
+        nth_damage_path = os.path.join(input_dir, nth_damage)
+        if os.path.isdir(nth_damage_path):
+            print(nth_damage)
+            for nth_test in os.listdir(nth_damage_path):
+                nth_test_path = os.path.join(nth_damage_path, nth_test)
+                # print(nth_test_path)
+                features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}
+                all_features.append(features)
+
+    # Create a DataFrame from the list of dictionaries
+    df = pd.DataFrame(all_features)
+    print(df)
+    # Save the DataFrame to a CSV file in the output directory
+    output_file_path = os.path.join(output_dir, 'combined_features.csv')
+    df.to_csv(output_file_path, index=False)
+    print(f"Features saved to {output_file_path}")
    # Save features to a file
-    np.savez(output_file, **features)
+    # np.savez(output_file, **features)

 if __name__ == "__main__":
    import sys
@@ -18,4 +41,4 @@ if __name__ == "__main__":
    output_path = sys.argv[2]  # 'data/features/feature_matrix.npz'
    
    # Assuming only one file for simplicity; adapt as needed
-    build_features(f"{input_path}processed_data.csv", output_path)
+    build_features(input_path, output_path)
--- a/code/src/features/time_domain_features.py
+++ b/code/src/features/time_domain_features.py
@@ -36,6 +36,16 @@ class FeatureExtractor:
            result += f"{feature}: {value:.4f}\n"
        return result

+def ExtractTimeFeatures(object, absolute):
+    data = pd.read_csv(object, skiprows=1) # Skip the header row separator char info
+    if absolute:
+        extractor = FeatureExtractor(np.abs(data.iloc[:, 1].values)) # Assuming the data is in the second column
+    else:
+        extractor = FeatureExtractor(data.iloc[:, 1].values)
+    features = extractor.features
+    return features
+    # Save features to a file
+    # np.savez(output_file, **features)
 # Usage
 # Assume you have a CSV file with numerical data in the first column
 # Create an instance of the class and pass the path to your CSV file
--- a/data/QUGS/convert.py
+++ b/data/QUGS/convert.py
@@ -0,0 +1,65 @@
+import pandas as pd
+import os
+import sys
+from colorama import Fore, Style, init
+
+def create_damage_files(base_path, output_base):
+    # Initialize colorama
+    init(autoreset=True)
+    
+    # Generate column labels based on expected duplication in input files
+    columns = ['Real'] + [f'Real.{i}' for i in range(1, 30)]  # Explicitly setting column names
+
+    sensor_end_map = {1: 'Real.25', 2: 'Real.26', 3: 'Real.27', 4: 'Real.28', 5: 'Real.29'}
+
+    # Define the damage scenarios and the corresponding original file indices
+    damage_scenarios = {
+        1: range(6, 11),  # Damage 1 files from zzzAD6.csv to zzzAD10.csv
+        2: range(11, 16), # Damage 2 files from zzzAD11.csv to zzzAD15.csvs
+        3: range(16, 21), # Damage 3 files from zzzAD16.csv to zzzAD20.csv
+        4: range(21, 26)  # Damage 4 files from zzzAD21.csv to zzzAD25.csv
+    }
+    damage_pad = len(str(len(damage_scenarios)))
+    test_pad = len(str(30))
+
+    for damage, files in damage_scenarios.items():
+        for i, file_index in enumerate(files, start=1):
+            # Load original data file
+            file_path = os.path.join(base_path, f'zzzAD{file_index}.TXT')
+            df = pd.read_csv(file_path, sep='\t', skiprows=10)  # Read with explicit column names
+
+            top_sensor = columns[i-1]
+            print(top_sensor, type(top_sensor))
+            output_file_1 = os.path.join(output_base, f'DAMAGE_{damage}', f'D{damage:0{damage_pad}}_TEST{i:0{test_pad}}_01.csv')
+            print(f"Creating {output_file_1} from taking zzzAD{file_index}.TXT")
+            print("Taking datetime column on index 0...")
+            print(f"Taking `{top_sensor}`...")
+            df[['Time', top_sensor]].to_csv(output_file_1, index=False)
+            print(Fore.GREEN + "Done")
+
+            bottom_sensor = sensor_end_map[i]
+            output_file_2 = os.path.join(output_base, f'DAMAGE_{damage}', f'D{damage}_TEST{i}_02.csv')
+            print(f"Creating {output_file_2} from taking zzzAD{file_index}.TXT")
+            print("Taking datetime column on index 0...")
+            print(f"Taking `{bottom_sensor}`...")
+            df[['Time', bottom_sensor]].to_csv(output_file_2, index=False)
+            print(Fore.GREEN + "Done")
+            print("---")
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python convert.py <path_to_csv_files>")
+        sys.exit(1)
+
+    base_path = sys.argv[1]
+    output_base = sys.argv[2]  # Define output directory
+
+    # Create output folders if they don't exist
+    for i in range(1, 5):
+        os.makedirs(os.path.join(output_base, f'DAMAGE_{i}'), exist_ok=True)
+
+    create_damage_files(base_path, output_base)
+    print(Fore.YELLOW + Style.BRIGHT + "All files have been created successfully.")
+
+if __name__ == "__main__":
+    main()
--- a/generate_dummy_data.py
+++ b/generate_dummy_data.py
@@ -16,8 +16,10 @@ os.makedirs(processed_path, exist_ok=True)
 # Define the number of zeros to pad
 num_damages = 5
 num_tests = 10
+num_sensors = 2
 damage_pad = len(str(num_damages))
 test_pad = len(str(num_tests))
+sensor_pad = len(str(num_sensors))

 for damage in range(1, num_damages + 1):  # 5 Damage levels starts from 1
    damage_folder = f"DAMAGE_{damage:0{damage_pad}}"
@@ -25,8 +27,9 @@ for damage in range(1, num_damages + 1):  # 5 Damage levels starts from 1
    os.makedirs(damage_path, exist_ok=True)

    for test in range(1, 11):  # 10 Tests per damage level
+        for sensor in range(1, 3):  # 2 Sensors per test
            # Filename for the CSV
-        csv_filename = f"D{damage:0{damage_pad}}_TEST{test:0{test_pad}}.csv"
+            csv_filename = f"D{damage:0{damage_pad}}_TEST{test:0{test_pad}}_{sensor:0{sensor_pad}}.csv"
            csv_path = os.path.join(damage_path, csv_filename)
            
            # Generate dummy data
Author	SHA1	Message	Date
nuluh	2f54e91197	feat: Add absolute value option to time feature extraction	2024-09-03 15:39:44 +07:00
nuluh	758255a24e	feat(notebooks): Implement Time-domain feature extraction with real data from QUGS	2024-09-03 12:52:40 +07:00
nuluh	ff5578652f	fix(script): Fix bug that took the wrong column by indexing `columns` and `sensor_end_map` with the enumeration loop counter.	2024-09-03 12:08:53 +07:00
nuluh	db2c5d3a4e	feat(script): Update output directory in convert.py	2024-09-03 11:50:44 +07:00
nuluh	ea978de872	-	2024-09-03 11:43:46 +07:00
nuluh	465d257850	feat(script): Add zero-padding to converted CSV filenames to standardize the processing pipeline	2024-09-03 11:38:49 +07:00
nuluh	d12eea0acf	feat(data-processing): Implement CSV data transformation for SVM analysis Introduce a Python script for transforming QUGS 2D grid structure data into a simplified 1D beam format suitable for SVM-based damage detection. The script efficiently slices original CSV files into smaller, manageable sets, correlating specific damage scenarios with their corresponding sensor data. This change addresses the challenge of retaining critical damage localization information during the data conversion process, ensuring high-quality, relevant data for 1D analysis. Closes #20	2024-09-03 11:33:23 +07:00
nuluh	0306f28a68	docs(notebooks): add `extract_numbers` docstring	2024-09-03 11:09:47 +07:00
Panuluh	9da3dae709	Merge pull request #18 from nuluh/feature/15-normalize-dataset-by-preprocess-relatives-value-between-two-acceloremeter-sensors Feature/15 normalize dataset by preprocess relatives value between two acceloremeter sensors	2024-09-03 08:43:44 +07:00
nuluh	41086e95ad	chore: Ignore .venv/ directory and update .gitignore after creating a venv to resolve the numpy error `ValueError: numpy.ndarray size changed, may indicate binary incompatibility.`	2024-09-01 14:50:24 +07:00
nuluh	adde35ed7e	feat(notebook): Normalize the data by calculating the relative value between two sensors. Along with it, MinMaxScaler and StandardScaler are applied and the results are visualized with Seaborn's Pair Plot. Closes #15	2024-09-01 14:50:04 +07:00
nuluh	b2684c23f6	feat(script): Add zero-padding to CSV filenames to include the sensor number	2024-08-27 10:11:39 +07:00
Panuluh	8a499a04fb	Merge pull request #17 from nuluh/feature/csv-padding-naming Feature/csv padding naming	2024-08-27 09:23:44 +07:00
Panuluh	118c56c12d	Merge pull request #13 from nuluh/feature/10-add-labels-column-to-time-domain-feature-extraction-dataframe feat(notebook): add 'labels' column to feature extraction dataframe	2024-08-26 09:55:46 +07:00
nuluh	79a0f82372	feat(notebook): add 'labels' column to feature extraction dataframe Implement extraction of 'labels' from directory names and append as a new column in the dataframe during feature extraction. Adapted from the existing `build_features.py` script to enhance data usability in supervised learning models within the Jupyter notebook environment. Closes #10	2024-08-20 15:28:19 +07:00
Panuluh	c9415c21fa	Merge pull request #9 from nuluh/feature/automate-csv-file Closes #4	2024-08-20 13:01:42 +07:00
nuluh	de902b2a8c	feat: Add launch.json for Python debugger configuration This commit adds a new file, `.vscode/launch.json`, which contains the configuration for launching the Python debugger. The configuration includes the necessary attributes such as the debugger type, request type, program file, console type, and command-line arguments. This configuration allows developers to easily debug Python files in the integrated terminal.	2024-08-20 12:52:48 +07:00
nuluh	57c0e03a4f	docs(script): Update time-domain feature extraction to skip header row separator char info	2024-08-20 12:52:48 +07:00
nuluh	8ab934fe1c	feat(features): refactor feature extraction to handle multiple files and directories - Modify `build_features` function to support iterative processing across nested directories, enhancing the system's ability to handle larger datasets and varied input structures. - Replace direct usage of `FeatureExtractor` class with `ExtractTimeFeatures` function, which now acts as a wrapper to include this class, facilitating streamlined integration and maintenance of feature extraction processes. - Implement `extract_numbers` function using regex to parse filenames and extract numeric identifiers, used for labels when training with SVM - Switch output from `.npz` to `.csv` format in `build_features`, offering better compatibility with data analysis tools and readability. - Update documentation and comments within the code to reflect changes in functionality and usage of the new feature extraction setup. Closes #4	2024-08-20 12:52:06 +07:00
nuluh	55db5709a9	refactor(script): Add time-domain feature extraction via the `ExtractTimeFeatures` function, which returns the features as a dictionary and is called from `build_features.py`. The function is called once for each individual `.csv` file, and each returned value is appended to a list in `build_features.py`. Using a function rather than instantiating the class directly improves flexibility and maintainability.	2024-08-19 13:20:14 +07:00