fix(docs): The readme.md should belong to raw data, since the script is intended to simulate raw data coming from accelerometer sensors rather than processed data, which should instead be generated by simulating frequency-domain data.

feat(script): add zero-padding to CSV filenames and write the generated CSV output as raw data in the raw folder
2024-08-18 10:34:22 +07:00 · 2024-08-17 19:51:42 +07:00
6 changed files with 30 additions and 1275 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,16 +0,0 @@
-{
-  // Use IntelliSense to learn about possible attributes.
-  // Hover to view descriptions of existing attributes.
-  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-  "version": "0.2.0",
-  "configurations": [
-    {
-      "name": "Python Debugger: Current File with Arguments",
-      "type": "debugpy",
-      "request": "launch",
-      "program": "${file}",
-      "console": "integratedTerminal",
-      "args": ["data/raw", "data/raw"]
-    }
-  ]
-}
--- a/code/notebooks/03_feature_extraction.ipynb
+++ b/code/notebooks/03_feature_extraction.ipynb
--- a/code/src/features/build_features.py
+++ b/code/src/features/build_features.py
@@ -1,39 +1,16 @@
 # src/features/build_features.py
 import pandas as pd
-from time_domain_features import ExtractTimeFeatures
-import os
-import re
+from time_domain_features import FeatureExtractor
+import numpy as np

-# define function, regex pattern for extracting the damage level and test number store in pairs array
-def extract_numbers(filename):
-    # Find all occurrences of one or more digits in the filename
-    numbers = re.findall(r'\d+', filename)
-    # Convert the list of number strings to integers
-    numbers = [int(num) for num in numbers]
-    # Convert to a tuple and return
-    return print(tuple(numbers))
+def build_features(input_file, output_file):
+    data = pd.read_csv(input_file)
+    # Assuming the relevant data is in the first column
+    extractor = FeatureExtractor(data.iloc[:, 0].values)
+    features = extractor.features

-def build_features(input_dir, output_dir):
-    all_features = []
-    for nth_damage in os.listdir(input_dir):
-        nth_damage_path = os.path.join(input_dir, nth_damage)
-        if os.path.isdir(nth_damage_path):
-            print(nth_damage)
-            for nth_test in os.listdir(nth_damage_path):
-                nth_test_path = os.path.join(nth_damage_path, nth_test)
-                # print(nth_test_path)
-                features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}
-                all_features.append(features)
-
-    # Create a DataFrame from the list of dictionaries
-    df = pd.DataFrame(all_features)
-    print(df)
-    # Save the DataFrame to a CSV file in the output directory
-    output_file_path = os.path.join(output_dir, 'combined_features.csv')
-    df.to_csv(output_file_path, index=False)
-    print(f"Features saved to {output_file_path}")
    # Save features to a file
-    # np.savez(output_file, **features)
+    np.savez(output_file, **features)

 if __name__ == "__main__":
    import sys
@@ -41,4 +18,4 @@ if __name__ == "__main__":
    output_path = sys.argv[2]  # 'data/features/feature_matrix.npz'
    
    # Assuming only one file for simplicity; adapt as needed
-    build_features(input_path, output_path)
+    build_features(f"{input_path}processed_data.csv", output_path)
--- a/code/src/features/time_domain_features.py
+++ b/code/src/features/time_domain_features.py
@@ -36,13 +36,6 @@ class FeatureExtractor:
            result += f"{feature}: {value:.4f}\n"
        return result

-def ExtractTimeFeatures(object):
-    data = pd.read_csv(object, skiprows=1) # Skip the header row separator char info
-    extractor = FeatureExtractor(data.iloc[:, 1].values) # Assuming the data is in the second column
-    features = extractor.features
-    return features
-    # Save features to a file
-    # np.savez(output_file, **features)
 # Usage
 # Assume you have a CSV file with numerical data in the first column
 # Create an instance of the class and pass the path to your CSV file
--- a/data/processed/README.md
+++ b/data/processed/README.md
@@ -1,8 +1,8 @@
-# Processed Data Directory
+# Raw Data Directory

 ## Overview

-This `data/processed` directory contains structured data that has been processed and formatted for analysis. Each subdirectory within `processed` represents a different level of simulated damage, and each contains multiple test files from experiments conducted under that specific damage scenario.
+This `data/raw` directory contains structured data that has been processed and formatted for analysis. Each subdirectory within `raw` represents a different level of simulated damage, and each contains multiple test files from experiments conducted under that specific damage scenario.

 ## Directory Structure

@@ -12,12 +12,12 @@ The directory is organized as follows:
 data
 └── processed
 ├── DAMAGE_1
-│ ├── D1_TEST1.csv
-│ ├── D1_TEST2.csv
+│   ├── D1_TEST1.csv
+│   ├── D1_TEST2.csv
 │ ...
-│ └── D1_TEST10.csv
+│   └── D1_TEST10.csv
 ├── DAMAGE_2
-│ ├── D2_TEST1.csv
+│   ├── D2_TEST1.csv
 │ ...
 ├── DAMAGE_3
 │ ...
--- a/generate_dummy_data.py
+++ b/generate_dummy_data.py
@@ -13,14 +13,20 @@ processed_path = os.path.join(base_path, "processed")
 os.makedirs(raw_path, exist_ok=True)
 os.makedirs(processed_path, exist_ok=True)

-for damage in range(1, 6):  # 5 Damage levels
-    damage_folder = f"DAMAGE_{damage}"
-    damage_path = os.path.join(processed_path, damage_folder)
+# Define the number of zeros to pad
+num_damages = 5
+num_tests = 10
+damage_pad = len(str(num_damages))
+test_pad = len(str(num_tests))
+
+for damage in range(1, num_damages + 1):  # 5 Damage levels starts from 1
+    damage_folder = f"DAMAGE_{damage:0{damage_pad}}"
+    damage_path = os.path.join(raw_path, damage_folder)
    os.makedirs(damage_path, exist_ok=True)

    for test in range(1, 11):  # 10 Tests per damage level
        # Filename for the CSV
-        csv_filename = f"D{damage}_TEST{test}.csv"
+        csv_filename = f"D{damage:0{damage_pad}}_TEST{test:0{test_pad}}.csv"
        csv_path = os.path.join(damage_path, csv_filename)

        # Generate dummy data
Author	SHA1	Message	Date
nuluh	3860f2cc5b	fix(docs): The readme.md should belong to raw data, since the script is intended to simulate raw data coming from accelerometer sensors rather than processed data, which should instead be generated by simulating frequency-domain data.	2024-08-18 10:34:22 +07:00
nuluh	553140fe3c	feat(script): add zero-padding to CSV filenames and write the generated CSV output as raw data in the `raw` folder	2024-08-17 19:51:42 +07:00