chore: Ignore the .venv/ directory in .gitignore. A virtual environment was created to work around the NumPy error `ValueError: numpy.ndarray size changed, may indicate binary incompatibility.`

feat(notebook): Normalize the data by calculating the relative value between two sensors. MinMaxScaler and StandardScaler are also applied, and the results are visualized with Seaborn's pair plot.
Closes #15
2024-09-01 14:50:24 +07:00 · 2024-09-01 14:50:04 +07:00 · 2024-08-27 10:11:39 +07:00 · 2024-08-27 09:23:44 +07:00 · 2024-08-26 09:55:46 +07:00 · 2024-08-20 15:28:19 +07:00
6 changed files with 740 additions and 32 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 # Ignore CSV files in the data directory and all its subdirectories
 data/**/*.csv
-
+.venv/
 *.pyc
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
 {
  // Use IntelliSense to learn about possible attributes.
  // Hover to view descriptions of existing attributes.
  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Python Debugger: Current File with Arguments",
      "type": "debugpy",
      "request": "launch",
      "program": "${file}",
      "console": "integratedTerminal",
      "args": ["data/raw", "data/raw"]
    }
  ]
 }
--- a/code/notebooks/03_feature_extraction.ipynb
+++ b/code/notebooks/03_feature_extraction.ipynb
--- a/code/src/features/build_features.py
+++ b/code/src/features/build_features.py
@@ -1,16 +1,39 @@
 # src/features/build_features.py
 import pandas as pd
-from time_domain_features import FeatureExtractor
+from time_domain_features import ExtractTimeFeatures
-import numpy as np
+import os
 import re
-def build_features(input_file, output_file):
+# Helper: regex-based extraction of the numbers (damage level, test number) embedded in a filename
-    data = pd.read_csv(input_file)
def extract_numbers(filename):
    """Return every run of digits found in ``filename`` as a tuple of ints.

    The numbers are returned in the order they appear in the string, e.g.
    ``"D1_TEST05_2.csv"`` gives ``(1, 5, 2)``. A name without digits gives
    an empty tuple.

    Args:
        filename: The file or directory name to scan.

    Returns:
        A tuple of ``int``, one per digit run in ``filename``.
    """
    # The previous version returned ``print(...)``, i.e. always ``None``;
    # return the tuple itself so callers can actually use the numbers.
    return tuple(int(num) for num in re.findall(r'\d+', filename))
 def build_features(input_dir, output_dir):
    """Extract features from every CSV below ``input_dir`` into one CSV.

    ``input_dir`` is scanned one level deep: each sub-directory is walked
    and every ``.csv`` file inside it is handed to ``ExtractTimeFeatures``.
    The per-file results become the rows of a single DataFrame, which is
    written to ``<output_dir>/combined_features.csv``.

    Args:
        input_dir: Directory whose sub-directories contain the CSV files.
        output_dir: Directory that receives ``combined_features.csv``; it
            is created if it does not exist yet.
    """
    all_features = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # row order of the output reproducible between runs and machines.
    for nth_damage in sorted(os.listdir(input_dir)):
        nth_damage_path = os.path.join(input_dir, nth_damage)
        if not os.path.isdir(nth_damage_path):
            continue
        print(nth_damage)
        for nth_test in sorted(os.listdir(nth_damage_path)):
            # Only CSV files are measurement data; stray entries (editor
            # backups, OS metadata files, ...) must not reach the parser.
            if not nth_test.lower().endswith('.csv'):
                continue
            nth_test_path = os.path.join(nth_damage_path, nth_test)
            # One feature mapping per CSV file
            all_features.append(ExtractTimeFeatures(nth_test_path))
    # Create a DataFrame from the list of per-file feature mappings
    df = pd.DataFrame(all_features)
    print(df)
    # Make sure the destination exists so to_csv() does not fail on a
    # fresh checkout where the output directory was never created.
    os.makedirs(output_dir, exist_ok=True)
    output_file_path = os.path.join(output_dir, 'combined_features.csv')
    df.to_csv(output_file_path, index=False)
    print(f"Features saved to {output_file_path}")
    # Save features to a file
-    np.savez(output_file, **features)
+    # np.savez(output_file, **features)
 if __name__ == "__main__":
    import sys
@@ -18,4 +41,4 @@ if __name__ == "__main__":
    output_path = sys.argv[2]  # 'data/features/feature_matrix.npz'
    # Assuming only one file for simplicity; adapt as needed
-    build_features(f"{input_path}processed_data.csv", output_path)
+    build_features(input_path, output_path)
--- a/code/src/features/time_domain_features.py
+++ b/code/src/features/time_domain_features.py
@@ -36,6 +36,13 @@ class FeatureExtractor:
            result += f"{feature}: {value:.4f}\n"
        return result
 def ExtractTimeFeatures(object):
    """Return the features ``FeatureExtractor`` computes for one CSV file.

    Args:
        object: Path (or file-like object) of the CSV file to read.

    Returns:
        The ``features`` attribute of the ``FeatureExtractor`` built from
        the file's second column.
    """
    # The first line of the file holds separator info rather than data,
    # so it is skipped before the real header row is parsed.
    frame = pd.read_csv(object, skiprows=1)
    # Assuming the data is in the second column
    signal = frame.iloc[:, 1].values
    return FeatureExtractor(signal).features
    # Save features to a file
    # np.savez(output_file, **features)
 # Usage
 # Assume you have a CSV file with numerical data in the first column
 # Create an instance of the class and pass the path to your CSV file
--- a/generate_dummy_data.py
+++ b/generate_dummy_data.py
@@ -16,8 +16,10 @@ os.makedirs(processed_path, exist_ok=True)
 # Define the number of zeros to pad
 num_damages = 5
 num_tests = 10
 num_sensors = 2
 damage_pad = len(str(num_damages))
 test_pad = len(str(num_tests))
 sensor_pad = len(str(num_sensors))
 for damage in range(1, num_damages + 1):  # 5 Damage levels starts from 1
    damage_folder = f"DAMAGE_{damage:0{damage_pad}}"
@@ -25,23 +27,24 @@ for damage in range(1, num_damages + 1):  # 5 Damage levels starts from 1
    os.makedirs(damage_path, exist_ok=True)
    for test in range(1, 11):  # 10 Tests per damage level
-        # Filename for the CSV
+        for sensor in range(1, 3):  # 2 Sensors per test
-        csv_filename = f"D{damage:0{damage_pad}}_TEST{test:0{test_pad}}.csv"
+            # Filename for the CSV
-        csv_path = os.path.join(damage_path, csv_filename)
+            csv_filename = f"D{damage:0{damage_pad}}_TEST{test:0{test_pad}}_{sensor:0{sensor_pad}}.csv"
            csv_path = os.path.join(damage_path, csv_filename)
-        # Generate dummy data
+            # Generate dummy data
-        num_rows = 10
+            num_rows = 10
-        start_time = datetime.now()
+            start_time = datetime.now()
-        timestamps = [start_time + timedelta(seconds=i*0.0078125) for i in range(num_rows)]
+            timestamps = [start_time + timedelta(seconds=i*0.0078125) for i in range(num_rows)]
-        values = np.random.randn(num_rows)  # Random float values
+            values = np.random.randn(num_rows)  # Random float values
-        # Create DataFrame
+            # Create DataFrame
-        df = pd.DataFrame({
+            df = pd.DataFrame({
-            "Time": timestamps,
+                "Time": timestamps,
-            "Value": values
+                "Value": values
-        })
+            })
-        # Save the CSV file with a custom header
+            # Save the CSV file with a custom header
-        with open(csv_path, 'w') as file:
+            with open(csv_path, 'w') as file:
-            file.write('sep=,\n')  # Writing the separator hint
+                file.write('sep=,\n')  # Writing the separator hint
-            df.to_csv(file, index=False)
+                df.to_csv(file, index=False)
Author	SHA1	Message	Date
nuluh	41086e95ad	chore: Ignore .venv/ directory and update .gitignore due to error numpy error `ValueError: numpy.ndarray size changed, may indicate binary incompatibility.` by creating venv.	2024-09-01 14:50:24 +07:00
nuluh	adde35ed7e	feat(notebook): Normalize the data by calculating the relative value between two sensors. Along with it, MinMaxScaler and StandardScaler are applied and visualize with Seaborn's Pair Plot. Closes #15	2024-09-01 14:50:04 +07:00
nuluh	b2684c23f6	feat(script): Add zero-padding to CSV filenames to include sensors number	2024-08-27 10:11:39 +07:00
Panuluh	8a499a04fb	Merge pull request #17 from nuluh/feature/csv-padding-naming Feature/csv padding naming	2024-08-27 09:23:44 +07:00
Panuluh	118c56c12d	Merge pull request #13 from nuluh/feature/10-add-labels-column-to-time-domain-feature-extraction-dataframe feat(notebook): add 'labels' column to feature extraction dataframe	2024-08-26 09:55:46 +07:00
nuluh	79a0f82372	feat(notebook): add 'labels' column to feature extraction dataframe Implement extraction of 'labels' from directory names and append as a new column in the dataframe during feature extraction. Adapted from the existing `build_features.py` script to enhance data usability in supervised learning models within the Jupyter notebook environment. Closes #10	2024-08-20 15:28:19 +07:00
Panuluh	c9415c21fa	Merge pull request #9 from nuluh/feature/automate-csv-file Closes #4	2024-08-20 13:01:42 +07:00
nuluh	de902b2a8c	feat: Add launch.json for Python debugger configuration This commit adds a new file, `.vscode/launch.json`, which contains the configuration for launching the Python debugger. The configuration includes the necessary attributes such as the debugger type, request type, program file, console type, and command-line arguments. This configuration allows developers to easily debug Python files in the integrated terminal.	2024-08-20 12:52:48 +07:00
nuluh	57c0e03a4f	docs(script): Update time-domain feature extraction to skip header row separator char info	2024-08-20 12:52:48 +07:00
nuluh	8ab934fe1c	feat(features): refactor feature extraction to handle multiple files and directories - Modify `build_features` function to support iterative processing across nested directories, enhancing the system's ability to handle larger datasets and varied input structures. - Replace direct usage of `FeatureExtractor` class with `ExtractTimeFeatures` function, which now acts as a wrapper to include this class, facilitating streamlined integration and maintenance of feature extraction processes. - Implement `extract_numbers` function using regex to parse filenames and extract numeric identifiers, used for labels when training with SVM - Switch output from `.npz` to `.csv` format in `build_features`, offering better compatibility with data analysis tools and readability. - Update documentation and comments within the code to reflect changes in functionality and usage of the new feature extraction setup. Closes #4	2024-08-20 12:52:06 +07:00
nuluh	55db5709a9	refactor(script): Add time-domain feature extraction functionality called `ExtractTimeFeatures` function returning features in {dictionary} that later called in `build_features.py`. This function will be called for each individual .`csv`. Each returning value later appended in `build_features.py`. This function approach rather than just assigning class ensure the flexibility and enhance maintainability.	2024-08-19 13:20:14 +07:00