Merge pull request #9 from nuluh/feature/automate-csv-file

Closes #4
This commit was merged in pull request #9.
This commit is contained in:
Panuluh
2024-08-20 13:01:42 +07:00
committed by GitHub
4 changed files with 141 additions and 14 deletions

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File with Arguments",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            // argv[1] = input directory, argv[2] = output directory for
            // src/features/build_features.py.
            // NOTE(review): both arguments point at "data/raw", so
            // combined_features.csv is written next to the raw data — confirm
            // this is intended (a separate output dir such as "data/processed"
            // seems likelier).
            "args": ["data/raw", "data/raw"]
        }
    ]
}

View File

@@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -154,7 +154,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -186,12 +186,12 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Print Time-domain Features" "### Print Time-domain Features (Single Mockup Data)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -264,7 +264,7 @@
"0 2.067638 1.917716 0.412307 " "0 2.067638 1.917716 0.412307 "
] ]
}, },
"execution_count": 23, "execution_count": 13,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -272,10 +272,12 @@
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"import sys\n", "import sys\n",
"import os\n",
"# Assuming the src directory is one level up from the notebooks directory\n", "# Assuming the src directory is one level up from the notebooks directory\n",
"sys.path.append('../src/features')\n", "sys.path.append('../src/features')\n",
"from time_domain_features import FeatureExtractor\n", "from time_domain_features import FeatureExtractor\n",
"\n", "\n",
"\n",
"# Extract features\n", "# Extract features\n",
"extracted = FeatureExtractor(mock_df['SampleData'])\n", "extracted = FeatureExtractor(mock_df['SampleData'])\n",
"\n", "\n",
@@ -283,6 +285,85 @@
"features = pd.DataFrame(extracted.features, index=[0])\n", "features = pd.DataFrame(extracted.features, index=[0])\n",
"features\n" "features\n"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print Time-domain Features (Multiple CSV Mockup Data)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import sys\n",
"import os\n",
"# Assuming the src directory is one level up from the notebooks directory\n",
"sys.path.append('../src/features')\n",
"from time_domain_features import ExtractTimeFeatures # use wrapper function instead of class for easy use\n",
"\n",
"def build_features(input_dir):\n",
" all_features = []\n",
" for nth_damage in os.listdir(input_dir):\n",
" nth_damage_path = os.path.join(input_dir, nth_damage)\n",
" if os.path.isdir(nth_damage_path):\n",
" # print(nth_damage)\n",
" for nth_test in os.listdir(nth_damage_path):\n",
" nth_test_path = os.path.join(nth_damage_path, nth_test)\n",
" # print(nth_test_path)\n",
" features = ExtractTimeFeatures(nth_test_path) # return the one csv file feature in dictionary {}\n",
" all_features.append(features)\n",
"\n",
" # Create a DataFrame from the list of dictionaries\n",
" df = pd.DataFrame(all_features)\n",
" return df\n",
"\n",
"data_dir = \"../../data/raw\"\n",
"# Extract features\n",
"df = build_features(data_dir)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 50 entries, 0 to 49\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Mean 50 non-null float64\n",
" 1 Max 50 non-null float64\n",
" 2 Peak (Pm) 50 non-null float64\n",
" 3 Peak-to-Peak (Pk) 50 non-null float64\n",
" 4 RMS 50 non-null float64\n",
" 5 Variance 50 non-null float64\n",
" 6 Standard Deviation 50 non-null float64\n",
" 7 Power 50 non-null float64\n",
" 8 Crest Factor 50 non-null float64\n",
" 9 Form Factor 50 non-null float64\n",
" 10 Pulse Indicator 50 non-null float64\n",
" 11 Margin 50 non-null float64\n",
" 12 Kurtosis 50 non-null float64\n",
" 13 Skewness 50 non-null float64\n",
"dtypes: float64(14)\n",
"memory usage: 5.6 KB\n"
]
}
],
"source": [
"df.info()"
]
} }
], ],
"metadata": { "metadata": {

View File

@@ -1,16 +1,39 @@
# src/features/build_features.py # src/features/build_features.py
import pandas as pd import pandas as pd
from time_domain_features import FeatureExtractor from time_domain_features import ExtractTimeFeatures
import numpy as np import os
import re
# define function, regex pattern for extracting the damage level and test number
def extract_numbers(filename):
    """Extract every integer embedded in *filename*.

    Intended to pull the damage level and test number out of a data file
    name, e.g. ``"damage3/test12.csv"`` -> ``(3, 12)``.

    Parameters
    ----------
    filename : str
        File name or path containing digit groups.

    Returns
    -------
    tuple of int
        All digit groups found, in order of appearance; empty tuple if none.
    """
    # Find all occurrences of one or more digits in the filename
    numbers = re.findall(r'\d+', filename)
    # Bug fix: the original ended with `return print(tuple(numbers))`, which
    # always returns None because print() returns None. Return the tuple of
    # integers itself, as the surrounding comments intended.
    return tuple(int(num) for num in numbers)
def build_features(input_dir, output_dir):
    """Extract time-domain features from every CSV under *input_dir* and save
    the combined table to ``combined_features.csv`` in *output_dir*.

    Expected layout: one sub-directory of *input_dir* per damage level, each
    holding one CSV per test run.

    Parameters
    ----------
    input_dir : str
        Root directory of the raw per-damage-level sub-directories.
    output_dir : str
        Directory that receives the combined CSV (created if missing).

    Returns
    -------
    pandas.DataFrame
        One row of features per CSV file (also written to disk). Returning
        the frame makes this consistent with the notebook version of the
        same function, which already returns it.
    """
    all_features = []
    # sorted() makes the row order deterministic; os.listdir() order is
    # otherwise arbitrary and platform-dependent.
    for nth_damage in sorted(os.listdir(input_dir)):
        nth_damage_path = os.path.join(input_dir, nth_damage)
        if os.path.isdir(nth_damage_path):
            print(nth_damage)
            for nth_test in sorted(os.listdir(nth_damage_path)):
                nth_test_path = os.path.join(nth_damage_path, nth_test)
                # ExtractTimeFeatures returns one dict of features per CSV file
                features = ExtractTimeFeatures(nth_test_path)
                all_features.append(features)

    # Build the DataFrame once from the list of dicts (avoids quadratic concat)
    df = pd.DataFrame(all_features)
    # Debug-output fix: preview the frame instead of dumping every row.
    print(df.head())

    # Save the DataFrame to a CSV file in the output directory
    os.makedirs(output_dir, exist_ok=True)  # robustness: create target dir if absent
    output_file_path = os.path.join(output_dir, 'combined_features.csv')
    df.to_csv(output_file_path, index=False)
    print(f"Features saved to {output_file_path}")
    return df
# Save features to a file # Save features to a file
np.savez(output_file, **features) # np.savez(output_file, **features)
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
@@ -18,4 +41,4 @@ if __name__ == "__main__":
output_path = sys.argv[2] # 'data/features/feature_matrix.npz' output_path = sys.argv[2] # 'data/features/feature_matrix.npz'
# Assuming only one file for simplicity; adapt as needed # Assuming only one file for simplicity; adapt as needed
build_features(f"{input_path}processed_data.csv", output_path) build_features(input_path, output_path)

View File

@@ -36,6 +36,13 @@ class FeatureExtractor:
result += f"{feature}: {value:.4f}\n" result += f"{feature}: {value:.4f}\n"
return result return result
def ExtractTimeFeatures(csv_path):
    """Convenience wrapper around FeatureExtractor for a single CSV file.

    Parameters
    ----------
    csv_path : str
        Path to one measurement CSV. The first row is skipped (it carries the
        separator-char header info, per the original comment).

    Returns
    -------
    dict
        Mapping of time-domain feature name -> value for this file.
    """
    # Fix: parameter was named `object`, shadowing the builtin; every visible
    # caller passes the path positionally, so the rename is backward-compatible.
    data = pd.read_csv(csv_path, skiprows=1)  # Skip the header row separator char info
    # NOTE(review): column index 1 assumes the sample values live in the
    # second column — confirm against the raw CSV layout.
    extractor = FeatureExtractor(data.iloc[:, 1].values)
    return extractor.features
# Save features to a file
# np.savez(output_file, **features)
# Usage # Usage
# Assume you have a CSV file with numerical data in the first column # Assume you have a CSV file with numerical data in the first column
# Create an instance of the class and pass the path to your CSV file # Create an instance of the class and pass the path to your CSV file