From 55db5709a9678d87724ab0e85105e6d745be26a4 Mon Sep 17 00:00:00 2001 From: nuluh Date: Mon, 19 Aug 2024 13:20:14 +0700 Subject: [PATCH 1/4] refactor(script): Add time-domain feature extraction functionality via the `ExtractTimeFeatures` function, which returns the features as a dictionary and is later called in `build_features.py`. This function will be called for each individual `.csv` file. Each returned value is later appended in `build_features.py`. Using a function rather than using the class directly ensures flexibility and enhances maintainability. --- code/src/features/time_domain_features.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/code/src/features/time_domain_features.py b/code/src/features/time_domain_features.py index d64566c..d37b061 100644 --- a/code/src/features/time_domain_features.py +++ b/code/src/features/time_domain_features.py @@ -36,6 +36,13 @@ class FeatureExtractor: result += f"{feature}: {value:.4f}\n" return result +def ExtractTimeFeatures(object): + data = pd.read_csv(object, skiprows=1) + extractor = FeatureExtractor(data.iloc[:, 1].values) # Assuming the data is in the second column + features = extractor.features + return features + # Save features to a file + # np.savez(output_file, **features) # Usage # Assume you have a CSV file with numerical data in the first column # Create an instance of the class and pass the path to your CSV file From 8ab934fe1c08ec274a80329bf170044ca4905846 Mon Sep 17 00:00:00 2001 From: nuluh Date: Tue, 20 Aug 2024 11:27:02 +0700 Subject: [PATCH 2/4] feat(features): refactor feature extraction to handle multiple files and directories - Modify `build_features` function to support iterative processing across nested directories, enhancing the system's ability to handle larger datasets and varied input structures. 
- Replace direct usage of `FeatureExtractor` class with `ExtractTimeFeatures` function, which now acts as a wrapper to include this class, facilitating streamlined integration and maintenance of feature extraction processes. - Implement `extract_numbers` function using regex to parse filenames and extract numeric identifiers, used for labels when training with SVM - Switch output from `.npz` to `.csv` format in `build_features`, offering better compatibility with data analysis tools and readability. - Update documentation and comments within the code to reflect changes in functionality and usage of the new feature extraction setup. Closes #4 --- code/src/features/build_features.py | 41 ++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/code/src/features/build_features.py b/code/src/features/build_features.py index 017f26c..8ecd03c 100644 --- a/code/src/features/build_features.py +++ b/code/src/features/build_features.py @@ -1,16 +1,39 @@ # src/features/build_features.py import pandas as pd -from time_domain_features import FeatureExtractor -import numpy as np +from time_domain_features import ExtractTimeFeatures +import os +import re -def build_features(input_file, output_file): - data = pd.read_csv(input_file) - # Assuming the relevant data is in the first column - extractor = FeatureExtractor(data.iloc[:, 0].values) - features = extractor.features +# define function, regex pattern for extracting the damage level and test number store in pairs array +def extract_numbers(filename): + # Find all occurrences of one or more digits in the filename + numbers = re.findall(r'\d+', filename) + # Convert the list of number strings to integers + numbers = [int(num) for num in numbers] + # Convert to a tuple and return + return print(tuple(numbers)) +def build_features(input_dir, output_dir): + all_features = [] + for nth_damage in os.listdir(input_dir): + nth_damage_path = os.path.join(input_dir, nth_damage) + if 
os.path.isdir(nth_damage_path): + print(nth_damage) + for nth_test in os.listdir(nth_damage_path): + nth_test_path = os.path.join(nth_damage_path, nth_test) + # print(nth_test_path) + features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {} + all_features.append(features) + + # Create a DataFrame from the list of dictionaries + df = pd.DataFrame(all_features) + print(df) + # Save the DataFrame to a CSV file in the output directory + output_file_path = os.path.join(output_dir, 'combined_features.csv') + df.to_csv(output_file_path, index=False) + print(f"Features saved to {output_file_path}") # Save features to a file - np.savez(output_file, **features) + # np.savez(output_file, **features) if __name__ == "__main__": import sys @@ -18,4 +41,4 @@ if __name__ == "__main__": output_path = sys.argv[2] # 'data/features/feature_matrix.npz' # Assuming only one file for simplicity; adapt as needed - build_features(f"{input_path}processed_data.csv", output_path) + build_features(input_path, output_path) From 57c0e03a4f7dce77730963b7225e6df3ce5deeee Mon Sep 17 00:00:00 2001 From: nuluh Date: Tue, 20 Aug 2024 11:31:24 +0700 Subject: [PATCH 3/4] docs(script): Update time-domain feature extraction to skip header row separator char info --- code/src/features/time_domain_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/src/features/time_domain_features.py b/code/src/features/time_domain_features.py index d37b061..1ef4ace 100644 --- a/code/src/features/time_domain_features.py +++ b/code/src/features/time_domain_features.py @@ -37,7 +37,7 @@ class FeatureExtractor: return result def ExtractTimeFeatures(object): - data = pd.read_csv(object, skiprows=1) + data = pd.read_csv(object, skiprows=1) # Skip the header row separator char info extractor = FeatureExtractor(data.iloc[:, 1].values) # Assuming the data is in the second column features = extractor.features return features From 
de902b2a8ce9ff73ec73b748ff508554801759ad Mon Sep 17 00:00:00 2001 From: nuluh Date: Tue, 20 Aug 2024 11:32:22 +0700 Subject: [PATCH 4/4] feat: Add launch.json for Python debugger configuration This commit adds a new file, `.vscode/launch.json`, which contains the configuration for launching the Python debugger. The configuration includes the necessary attributes such as the debugger type, request type, program file, console type, and command-line arguments. This configuration allows developers to easily debug Python files in the integrated terminal. --- .vscode/launch.json | 16 ++++ code/notebooks/03_feature_extraction.ipynb | 91 ++++++++++++++++++++-- 2 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..fef1db9 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "args": ["data/raw", "data/raw"] + } + ] +} diff --git a/code/notebooks/03_feature_extraction.ipynb b/code/notebooks/03_feature_extraction.ipynb index b0de9e1..9f54286 100644 --- a/code/notebooks/03_feature_extraction.ipynb +++ b/code/notebooks/03_feature_extraction.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -154,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -186,12 +186,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Print Time-domain Features" + "### Print Time-domain Features (Single Mockup Data)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -264,7 +264,7 @@ "0 2.067638 1.917716 0.412307 " ] }, - "execution_count": 23, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -272,10 +272,12 @@ "source": [ "import pandas as pd\n", "import sys\n", + "import os\n", "# Assuming the src directory is one level up from the notebooks directory\n", "sys.path.append('../src/features')\n", "from time_domain_features import FeatureExtractor\n", "\n", + "\n", "# Extract features\n", "extracted = FeatureExtractor(mock_df['SampleData'])\n", "\n", @@ -283,6 +285,85 @@ "features = pd.DataFrame(extracted.features, index=[0])\n", "features\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print Time-domain Features (Multiple CSV Mockup Data)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import 
sys\n", + "import os\n", + "# Assuming the src directory is one level up from the notebooks directory\n", + "sys.path.append('../src/features')\n", + "from time_domain_features import ExtractTimeFeatures # use wrapper function instead of class for easy use\n", + "\n", + "def build_features(input_dir):\n", + " all_features = []\n", + " for nth_damage in os.listdir(input_dir):\n", + " nth_damage_path = os.path.join(input_dir, nth_damage)\n", + " if os.path.isdir(nth_damage_path):\n", + " # print(nth_damage)\n", + " for nth_test in os.listdir(nth_damage_path):\n", + " nth_test_path = os.path.join(nth_damage_path, nth_test)\n", + " # print(nth_test_path)\n", + " features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}\n", + " all_features.append(features)\n", + "\n", + " # Create a DataFrame from the list of dictionaries\n", + " df = pd.DataFrame(all_features)\n", + " return df\n", + "\n", + "data_dir = \"../../data/raw\"\n", + "# Extract features\n", + "df = build_features(data_dir)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 50 entries, 0 to 49\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Mean 50 non-null float64\n", + " 1 Max 50 non-null float64\n", + " 2 Peak (Pm) 50 non-null float64\n", + " 3 Peak-to-Peak (Pk) 50 non-null float64\n", + " 4 RMS 50 non-null float64\n", + " 5 Variance 50 non-null float64\n", + " 6 Standard Deviation 50 non-null float64\n", + " 7 Power 50 non-null float64\n", + " 8 Crest Factor 50 non-null float64\n", + " 9 Form Factor 50 non-null float64\n", + " 10 Pulse Indicator 50 non-null float64\n", + " 11 Margin 50 non-null float64\n", + " 12 Kurtosis 50 non-null float64\n", + " 13 Skewness 50 non-null float64\n", + "dtypes: float64(14)\n", + "memory usage: 5.6 KB\n" + ] + } + 
], + "source": [ + "df.info()" + ] } ], "metadata": {