fix(docs): The README.md should belong to the raw data, since the script is intended to simulate raw data coming from accelerometer sensors, not processed data, which should instead be generated by simulating frequency-domain data.

feat(script): add zero-padding to CSV filenames and write the generated CSV output as raw data in the raw folder
2024-08-18 10:34:22 +07:00 · 2024-08-17 19:51:42 +07:00
6 changed files with 30 additions and 151 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,16 +0,0 @@
 {
  // Use IntelliSense to learn about possible attributes.
  // Hover to view descriptions of existing attributes.
  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Python Debugger: Current File with Arguments",
      "type": "debugpy",
      "request": "launch",
      "program": "${file}",
      "console": "integratedTerminal",
      "args": ["data/raw", "data/raw"]
    }
  ]
 }
--- a/code/notebooks/03_feature_extraction.ipynb
+++ b/code/notebooks/03_feature_extraction.ipynb
@@ -25,7 +25,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -154,7 +154,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
@@ -186,12 +186,12 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### Print Time-domain Features (Single Mockup Data)"
+    "### Print Time-domain Features"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
@@ -264,7 +264,7 @@
       "0  2.067638  1.917716  0.412307  "
      ]
     },
-     "execution_count": 13,
+     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -272,12 +272,10 @@
   "source": [
    "import pandas as pd\n",
    "import sys\n",
    "import os\n",
    "# Assuming the src directory is one level up from the notebooks directory\n",
    "sys.path.append('../src/features')\n",
    "from time_domain_features import FeatureExtractor\n",
    "\n",
    "\n",
    "# Extract features\n",
    "extracted = FeatureExtractor(mock_df['SampleData'])\n",
    "\n",
@@ -285,85 +283,6 @@
    "features = pd.DataFrame(extracted.features, index=[0])\n",
    "features\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Print Time-domain Features (Multiple CSV Mockup Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import sys\n",
    "import os\n",
    "# Assuming the src directory is one level up from the notebooks directory\n",
    "sys.path.append('../src/features')\n",
    "from time_domain_features import ExtractTimeFeatures # use wrapper function instead of class for easy use\n",
    "\n",
    "def build_features(input_dir):\n",
    "    all_features = []\n",
    "    for nth_damage in os.listdir(input_dir):\n",
    "        nth_damage_path = os.path.join(input_dir, nth_damage)\n",
    "        if os.path.isdir(nth_damage_path):\n",
    "            # print(nth_damage)\n",
    "            for nth_test in os.listdir(nth_damage_path):\n",
    "                nth_test_path = os.path.join(nth_damage_path, nth_test)\n",
    "                # print(nth_test_path)\n",
    "                features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}\n",
    "                all_features.append(features)\n",
    "\n",
    "    # Create a DataFrame from the list of dictionaries\n",
    "    df = pd.DataFrame(all_features)\n",
    "    return df\n",
    "\n",
    "data_dir = \"../../data/raw\"\n",
    "# Extract features\n",
    "df = build_features(data_dir)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 50 entries, 0 to 49\n",
      "Data columns (total 14 columns):\n",
      " #   Column              Non-Null Count  Dtype  \n",
      "---  ------              --------------  -----  \n",
      " 0   Mean                50 non-null     float64\n",
      " 1   Max                 50 non-null     float64\n",
      " 2   Peak (Pm)           50 non-null     float64\n",
      " 3   Peak-to-Peak (Pk)   50 non-null     float64\n",
      " 4   RMS                 50 non-null     float64\n",
      " 5   Variance            50 non-null     float64\n",
      " 6   Standard Deviation  50 non-null     float64\n",
      " 7   Power               50 non-null     float64\n",
      " 8   Crest Factor        50 non-null     float64\n",
      " 9   Form Factor         50 non-null     float64\n",
      " 10  Pulse Indicator     50 non-null     float64\n",
      " 11  Margin              50 non-null     float64\n",
      " 12  Kurtosis            50 non-null     float64\n",
      " 13  Skewness            50 non-null     float64\n",
      "dtypes: float64(14)\n",
      "memory usage: 5.6 KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  }
 ],
 "metadata": {
--- a/code/src/features/build_features.py
+++ b/code/src/features/build_features.py
@@ -1,39 +1,16 @@
 # src/features/build_features.py
 import pandas as pd
-from time_domain_features import ExtractTimeFeatures
+from time_domain_features import FeatureExtractor
-import os
+import numpy as np
 import re
-# define function, regex pattern for extracting the damage level and test number store in pairs array
+def build_features(input_file, output_file):
-def extract_numbers(filename):
+    data = pd.read_csv(input_file)
-    # Find all occurrences of one or more digits in the filename
+    # Assuming the relevant data is in the first column
-    numbers = re.findall(r'\d+', filename)
+    extractor = FeatureExtractor(data.iloc[:, 0].values)
-    # Convert the list of number strings to integers
+    features = extractor.features
    numbers = [int(num) for num in numbers]
    # Convert to a tuple and return
    return print(tuple(numbers))
 def build_features(input_dir, output_dir):
    all_features = []
    for nth_damage in os.listdir(input_dir):
        nth_damage_path = os.path.join(input_dir, nth_damage)
        if os.path.isdir(nth_damage_path):
            print(nth_damage)
            for nth_test in os.listdir(nth_damage_path):
                nth_test_path = os.path.join(nth_damage_path, nth_test)
                # print(nth_test_path)
                features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}
                all_features.append(features)
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(all_features)
    print(df)
    # Save the DataFrame to a CSV file in the output directory
    output_file_path = os.path.join(output_dir, 'combined_features.csv')
    df.to_csv(output_file_path, index=False)
    print(f"Features saved to {output_file_path}")
    # Save features to a file
-    # np.savez(output_file, **features)
+    np.savez(output_file, **features)
 if __name__ == "__main__":
    import sys
@@ -41,4 +18,4 @@ if __name__ == "__main__":
    output_path = sys.argv[2]  # 'data/features/feature_matrix.npz'
    # Assuming only one file for simplicity; adapt as needed
-    build_features(input_path, output_path)
+    build_features(f"{input_path}processed_data.csv", output_path)
--- a/code/src/features/time_domain_features.py
+++ b/code/src/features/time_domain_features.py
@@ -36,13 +36,6 @@ class FeatureExtractor:
            result += f"{feature}: {value:.4f}\n"
        return result
 def ExtractTimeFeatures(object):
    data = pd.read_csv(object, skiprows=1) # Skip the header row separator char info
    extractor = FeatureExtractor(data.iloc[:, 1].values) # Assuming the data is in the second column
    features = extractor.features
    return features
    # Save features to a file
    # np.savez(output_file, **features)
 # Usage
 # Assume you have a CSV file with numerical data in the first column
 # Create an instance of the class and pass the path to your CSV file
--- a/data/processed/README.md
+++ b/data/processed/README.md
@@ -1,8 +1,8 @@
-# Processed Data Directory
+# Raw Data Directory
 ## Overview
-This `data/processed` directory contains structured data that has been processed and formatted for analysis. Each subdirectory within `processed` represents a different level of simulated damage, and each contains multiple test files from experiments conducted under that specific damage scenario.
+This `data/raw` directory contains structured data that has been processed and formatted for analysis. Each subdirectory within `raw` represents a different level of simulated damage, and each contains multiple test files from experiments conducted under that specific damage scenario.
 ## Directory Structure
--- a/generate_dummy_data.py
+++ b/generate_dummy_data.py
@@ -13,14 +13,20 @@ processed_path = os.path.join(base_path, "processed")
 os.makedirs(raw_path, exist_ok=True)
 os.makedirs(processed_path, exist_ok=True)
-for damage in range(1, 6):  # 5 Damage levels
+# Define the number of zeros to pad
-    damage_folder = f"DAMAGE_{damage}"
+num_damages = 5
-    damage_path = os.path.join(processed_path, damage_folder)
+num_tests = 10
 damage_pad = len(str(num_damages))
 test_pad = len(str(num_tests))
 for damage in range(1, num_damages + 1):  # 5 Damage levels starts from 1
    damage_folder = f"DAMAGE_{damage:0{damage_pad}}"
    damage_path = os.path.join(raw_path, damage_folder)
    os.makedirs(damage_path, exist_ok=True)
    for test in range(1, 11):  # 10 Tests per damage level
        # Filename for the CSV
-        csv_filename = f"D{damage}_TEST{test}.csv"
+        csv_filename = f"D{damage:0{damage_pad}}_TEST{test:0{test_pad}}.csv"
        csv_path = os.path.join(damage_path, csv_filename)
        # Generate dummy data
Author	SHA1	Message	Date
nuluh	3860f2cc5b	fix(docs): The README.md should belong to the raw data, since the script is intended to simulate raw data coming from accelerometer sensors, not processed data, which should instead be generated by simulating frequency-domain data.	2024-08-18 10:34:22 +07:00
nuluh	553140fe3c	feat(script): add zero-padding to CSV filenames and write the generated CSV output as raw data in the `raw` folder	2024-08-17 19:51:42 +07:00