fix(docs): The README.md should belong to the raw data directory, since the script is intended to simulate raw data coming from accelerometer sensors rather than processed data, which should be generated by simulating frequency-domain data instead.

feat(script): add zero-padding to CSV filenames and write the generated CSV output as raw data in the `raw` folder
2024-08-18 10:34:22 +07:00 · 2024-08-17 19:51:42 +07:00
6 changed files with 30 additions and 151 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,16 +0,0 @@
-{
-  // Use IntelliSense to learn about possible attributes.
-  // Hover to view descriptions of existing attributes.
-  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-  "version": "0.2.0",
-  "configurations": [
-    {
-      "name": "Python Debugger: Current File with Arguments",
-      "type": "debugpy",
-      "request": "launch",
-      "program": "${file}",
-      "console": "integratedTerminal",
-      "args": ["data/raw", "data/raw"]
-    }
-  ]
-}
--- a/code/notebooks/03_feature_extraction.ipynb
+++ b/code/notebooks/03_feature_extraction.ipynb
@@ -25,7 +25,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -154,7 +154,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
@@ -186,12 +186,12 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### Print Time-domain Features (Single Mockup Data)"
+    "### Print Time-domain Features"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
@@ -264,7 +264,7 @@
       "0  2.067638  1.917716  0.412307  "
      ]
     },
-     "execution_count": 13,
+     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -272,12 +272,10 @@
   "source": [
    "import pandas as pd\n",
    "import sys\n",
-    "import os\n",
    "# Assuming the src directory is one level up from the notebooks directory\n",
    "sys.path.append('../src/features')\n",
    "from time_domain_features import FeatureExtractor\n",
    "\n",
-    "\n",
    "# Extract features\n",
    "extracted = FeatureExtractor(mock_df['SampleData'])\n",
    "\n",
@@ -285,85 +283,6 @@
    "features = pd.DataFrame(extracted.features, index=[0])\n",
    "features\n"
   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Print Time-domain Features (Multiple CSV Mockup Data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import sys\n",
-    "import os\n",
-    "# Assuming the src directory is one level up from the notebooks directory\n",
-    "sys.path.append('../src/features')\n",
-    "from time_domain_features import ExtractTimeFeatures # use wrapper function instead of class for easy use\n",
-    "\n",
-    "def build_features(input_dir):\n",
-    "    all_features = []\n",
-    "    for nth_damage in os.listdir(input_dir):\n",
-    "        nth_damage_path = os.path.join(input_dir, nth_damage)\n",
-    "        if os.path.isdir(nth_damage_path):\n",
-    "            # print(nth_damage)\n",
-    "            for nth_test in os.listdir(nth_damage_path):\n",
-    "                nth_test_path = os.path.join(nth_damage_path, nth_test)\n",
-    "                # print(nth_test_path)\n",
-    "                features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}\n",
-    "                all_features.append(features)\n",
-    "\n",
-    "    # Create a DataFrame from the list of dictionaries\n",
-    "    df = pd.DataFrame(all_features)\n",
-    "    return df\n",
-    "\n",
-    "data_dir = \"../../data/raw\"\n",
-    "# Extract features\n",
-    "df = build_features(data_dir)\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "RangeIndex: 50 entries, 0 to 49\n",
-      "Data columns (total 14 columns):\n",
-      " #   Column              Non-Null Count  Dtype  \n",
-      "---  ------              --------------  -----  \n",
-      " 0   Mean                50 non-null     float64\n",
-      " 1   Max                 50 non-null     float64\n",
-      " 2   Peak (Pm)           50 non-null     float64\n",
-      " 3   Peak-to-Peak (Pk)   50 non-null     float64\n",
-      " 4   RMS                 50 non-null     float64\n",
-      " 5   Variance            50 non-null     float64\n",
-      " 6   Standard Deviation  50 non-null     float64\n",
-      " 7   Power               50 non-null     float64\n",
-      " 8   Crest Factor        50 non-null     float64\n",
-      " 9   Form Factor         50 non-null     float64\n",
-      " 10  Pulse Indicator     50 non-null     float64\n",
-      " 11  Margin              50 non-null     float64\n",
-      " 12  Kurtosis            50 non-null     float64\n",
-      " 13  Skewness            50 non-null     float64\n",
-      "dtypes: float64(14)\n",
-      "memory usage: 5.6 KB\n"
-     ]
-    }
-   ],
-   "source": [
-    "df.info()"
-   ]
  }
 ],
 "metadata": {
--- a/code/src/features/build_features.py
+++ b/code/src/features/build_features.py
@@ -1,39 +1,16 @@
 # src/features/build_features.py
 import pandas as pd
-from time_domain_features import ExtractTimeFeatures
-import os
-import re
+from time_domain_features import FeatureExtractor
+import numpy as np

-# define function, regex pattern for extracting the damage level and test number store in pairs array
-def extract_numbers(filename):
-    # Find all occurrences of one or more digits in the filename
-    numbers = re.findall(r'\d+', filename)
-    # Convert the list of number strings to integers
-    numbers = [int(num) for num in numbers]
-    # Convert to a tuple and return
-    return print(tuple(numbers))
+def build_features(input_file, output_file):
+    data = pd.read_csv(input_file)
+    # Assuming the relevant data is in the first column
+    extractor = FeatureExtractor(data.iloc[:, 0].values)
+    features = extractor.features

-def build_features(input_dir, output_dir):
-    all_features = []
-    for nth_damage in os.listdir(input_dir):
-        nth_damage_path = os.path.join(input_dir, nth_damage)
-        if os.path.isdir(nth_damage_path):
-            print(nth_damage)
-            for nth_test in os.listdir(nth_damage_path):
-                nth_test_path = os.path.join(nth_damage_path, nth_test)
-                # print(nth_test_path)
-                features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}
-                all_features.append(features)
-
-    # Create a DataFrame from the list of dictionaries
-    df = pd.DataFrame(all_features)
-    print(df)
-    # Save the DataFrame to a CSV file in the output directory
-    output_file_path = os.path.join(output_dir, 'combined_features.csv')
-    df.to_csv(output_file_path, index=False)
-    print(f"Features saved to {output_file_path}")
    # Save features to a file
-    # np.savez(output_file, **features)
+    np.savez(output_file, **features)

 if __name__ == "__main__":
    import sys
@@ -41,4 +18,4 @@ if __name__ == "__main__":
    output_path = sys.argv[2]  # 'data/features/feature_matrix.npz'
    
    # Assuming only one file for simplicity; adapt as needed
-    build_features(input_path, output_path)
+    build_features(f"{input_path}processed_data.csv", output_path)
--- a/code/src/features/time_domain_features.py
+++ b/code/src/features/time_domain_features.py
@@ -36,13 +36,6 @@ class FeatureExtractor:
            result += f"{feature}: {value:.4f}\n"
        return result

-def ExtractTimeFeatures(object):
-    data = pd.read_csv(object, skiprows=1) # Skip the header row separator char info
-    extractor = FeatureExtractor(data.iloc[:, 1].values) # Assuming the data is in the second column
-    features = extractor.features
-    return features
-    # Save features to a file
-    # np.savez(output_file, **features)
 # Usage
 # Assume you have a CSV file with numerical data in the first column
 # Create an instance of the class and pass the path to your CSV file
--- a/data/processed/README.md
+++ b/data/processed/README.md
@@ -1,8 +1,8 @@
-# Processed Data Directory
+# Raw Data Directory

 ## Overview

-This `data/processed` directory contains structured data that has been processed and formatted for analysis. Each subdirectory within `processed` represents a different level of simulated damage, and each contains multiple test files from experiments conducted under that specific damage scenario.
+This `data/raw` directory contains structured data that has been processed and formatted for analysis. Each subdirectory within `raw` represents a different level of simulated damage, and each contains multiple test files from experiments conducted under that specific damage scenario.

 ## Directory Structure

@@ -12,12 +12,12 @@ The directory is organized as follows:
 data
 └── processed
 ├── DAMAGE_1
-│ ├── D1_TEST1.csv
-│ ├── D1_TEST2.csv
+│   ├── D1_TEST1.csv
+│   ├── D1_TEST2.csv
 │ ...
-│ └── D1_TEST10.csv
+│   └── D1_TEST10.csv
 ├── DAMAGE_2
-│ ├── D2_TEST1.csv
+│   ├── D2_TEST1.csv
 │ ...
 ├── DAMAGE_3
 │ ...
--- a/generate_dummy_data.py
+++ b/generate_dummy_data.py
@@ -13,14 +13,20 @@ processed_path = os.path.join(base_path, "processed")
 os.makedirs(raw_path, exist_ok=True)
 os.makedirs(processed_path, exist_ok=True)

-for damage in range(1, 6):  # 5 Damage levels
-    damage_folder = f"DAMAGE_{damage}"
-    damage_path = os.path.join(processed_path, damage_folder)
+# Define the number of zeros to pad
+num_damages = 5
+num_tests = 10
+damage_pad = len(str(num_damages))
+test_pad = len(str(num_tests))
+
+for damage in range(1, num_damages + 1):  # 5 Damage levels starts from 1
+    damage_folder = f"DAMAGE_{damage:0{damage_pad}}"
+    damage_path = os.path.join(raw_path, damage_folder)
    os.makedirs(damage_path, exist_ok=True)

    for test in range(1, 11):  # 10 Tests per damage level
        # Filename for the CSV
-        csv_filename = f"D{damage}_TEST{test}.csv"
+        csv_filename = f"D{damage:0{damage_pad}}_TEST{test:0{test_pad}}.csv"
        csv_path = os.path.join(damage_path, csv_filename)

        # Generate dummy data
Author	SHA1	Message	Date
nuluh	3860f2cc5b	fix(docs): The README.md should belong to the raw data directory, since the script is intended to simulate raw data coming from accelerometer sensors rather than processed data, which should be generated by simulating frequency-domain data instead.	2024-08-18 10:34:22 +07:00
nuluh	553140fe3c	feat(script): add zero-padding to CSV filenames and write the generated CSV output as raw data in the `raw` folder	2024-08-17 19:51:42 +07:00