refactor(test): update test script to generate damage files index for dataset_B and adjust export path for processed data

fix(data): fix the incorrect output of scipy.stft() so that it is stored as a pandas.DataFrame shaped (513, 513), built with the frequencies as the index and the times as the columns and then transposed, instead of only the flattened magnitude; add checks for empty data and correct the file paths for sensor data loading.
Closes #43
2025-04-20 16:02:16 +07:00 · 2025-04-20 14:45:38 +07:00 · 2025-04-17 10:10:19 +07:00 · 2025-03-22 19:57:20 +07:00 · 2025-03-22 19:48:50 +07:00 · 2025-03-21 15:58:50 +07:00
3 changed files with 360 additions and 33 deletions
--- a/code/notebooks/stft.ipynb
+++ b/code/notebooks/stft.ipynb
@@ -121,8 +121,9 @@
    "signal_sensor2_test1 = []\n",
    "\n",
    "for data in df:\n",
-    "    signal_sensor1_test1.append(data['sensor 1'].values)\n",
-    "    signal_sensor2_test1.append(data['sensor 2'].values)\n",
+    "    if not data.empty and 'sensor 1' in data.columns and 'sensor 2' in data.columns:\n",
+    "        signal_sensor1_test1.append(data['sensor 1'].values)\n",
+    "        signal_sensor2_test1.append(data['sensor 2'].values)\n",
    "\n",
    "print(len(signal_sensor1_test1))\n",
    "print(len(signal_sensor2_test1))"
@@ -156,8 +157,6 @@
    "from scipy.signal import stft, hann\n",
    "from multiprocessing import Pool\n",
    "\n",
-    "\n",
-    "\n",
    "# Function to compute and append STFT data\n",
    "def process_stft(args):\n",
    "    # Define STFT parameters\n",
@@ -199,23 +198,22 @@
    "    # Compute STFT\n",
    "    frequencies, times, Zxx = stft(sensor_data, fs=Fs, window=window, nperseg=window_size, noverlap=window_size - hop_size)\n",
    "    magnitude = np.abs(Zxx)\n",
-    "    flattened_stft = magnitude.flatten()\n",
+    "    df_stft = pd.DataFrame(magnitude, index=frequencies, columns=times).T\n",
+    "    df_stft.columns = [f\"Freq_{i}\" for i in frequencies]\n",
    "    \n",
    "    # Define the output CSV file path\n",
    "    stft_file_name = f'stft_data{sensor_num}_{damage_num}.csv'\n",
    "    sensor_output_dir = os.path.join(damage_base_path, sensor_name.lower())\n",
    "    os.makedirs(sensor_output_dir, exist_ok=True)\n",
    "    stft_file_path = os.path.join(sensor_output_dir, stft_file_name)\n",
-    "    print(stft_file_path)\n",
    "    # Append the flattened STFT to the CSV\n",
    "    try:\n",
-    "        flattened_stft_df = pd.DataFrame([flattened_stft])\n",
    "        if not os.path.isfile(stft_file_path):\n",
    "            # Create a new CSV\n",
-    "            flattened_stft_df.to_csv(stft_file_path, index=False, header=False)\n",
+    "            df_stft.to_csv(stft_file_path, index=False, header=False)\n",
    "        else:\n",
    "            # Append to existing CSV\n",
-    "            flattened_stft_df.to_csv(stft_file_path, mode='a', index=False, header=False)\n",
+    "            df_stft.to_csv(stft_file_path, mode='a', index=False, header=False)\n",
    "        print(f\"Appended STFT data to {stft_file_path}\")\n",
    "    except Exception as e:\n",
    "        print(f\"Error writing to {stft_file_path}: {e}\")"
@@ -295,7 +293,7 @@
    "\n",
    "# get current y ticks in list\n",
    "print(len(frequencies))\n",
-    "print(len(times))\n"
+    "print(len(times))"
   ]
  },
  {
@@ -324,8 +322,8 @@
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "ready_data1 = []\n",
-    "for file in os.listdir('D:/thesis/data/working/sensor1'):\n",
-    "    ready_data1.append(pd.read_csv(os.path.join('D:/thesis/data/working/sensor1', file)))\n",
+    "for file in os.listdir('D:/thesis/data/converted/raw/sensor1'):\n",
+    "    ready_data1.append(pd.read_csv(os.path.join('D:/thesis/data/converted/raw/sensor1', file)))\n",
    "# ready_data1[1]\n",
    "# colormesh give title x is frequency and y is time and rotate/transpose the data\n",
    "# Plotting the STFT Data"
@@ -337,8 +335,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "ready_data1[1]\n",
-    "plt.pcolormesh(ready_data1[1])"
+    "# ready_data1[1]\n",
+    "plt.pcolormesh(ready_data1[2])"
   ]
  },
  {
@@ -362,9 +360,8 @@
   "outputs": [],
   "source": [
    "ready_data2 = []\n",
-    "for file in os.listdir('D:/thesis/data/working/sensor2'):\n",
-    "    ready_data2.append(pd.read_csv(os.path.join('D:/thesis/data/working/sensor2', file)))\n",
-    "ready_data2[5]"
+    "for file in os.listdir('D:/thesis/data/converted/raw/sensor2'):\n",
+    "    ready_data2.append(pd.read_csv(os.path.join('D:/thesis/data/converted/raw/sensor2', file)))"
   ]
  },
  {
@@ -384,10 +381,25 @@
   "outputs": [],
   "source": [
    "x1 = 0\n",
-    "\n",
+    "print(type(ready_data1[0]))\n",
+    "ready_data1[0].iloc[:,0]\n",
+    "# x1 = x1 + ready_data1[0].shape[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x1 = 0\n",
+    "print(type(x1))\n",
    "for i in range(len(ready_data1)):\n",
-    "    print(ready_data1[i].shape)\n",
+    "    # print(ready_data1[i].shape)\n",
+    "    # print(ready_data1[i].)\n",
+    "    print(type(ready_data1[i].shape[0]))\n",
    "    x1 = x1 + ready_data1[i].shape[0]\n",
+    "    print(type(x1))\n",
    "\n",
    "print(x1)"
   ]
--- a/data/QUGS/convert.py
+++ b/data/QUGS/convert.py
@@ -1,25 +1,307 @@
 import pandas as pd
 import os
+import re
 import sys
+import numpy as np
 from colorama import Fore, Style, init
+from typing import TypedDict, Dict, List
+from joblib import load
+from pprint import pprint
+
+# class DamageFilesIndices(TypedDict):
+#     damage_index: int
+#     files: list[int]
+OriginalSingleDamageScenarioFilePath = str
+DamageScenarioGroupIndex = int
+OriginalSingleDamageScenario = pd.DataFrame
+SensorIndex = int
+VectorColumnIndex = List[SensorIndex]
+VectorColumnIndices = List[VectorColumnIndex]
+DamageScenarioGroup = List[OriginalSingleDamageScenario]
+GroupDataset = List[DamageScenarioGroup]
+
+
+class DamageFilesIndices(TypedDict):
+    damage_index: int
+    files: List[str]
+
+
+def generate_damage_files_index(**kwargs) -> DamageFilesIndices:
+    prefix: str = kwargs.get("prefix", "zzzAD")
+    extension: str = kwargs.get("extension", ".TXT")
+    num_damage: int = kwargs.get("num_damage")
+    file_index_start: int = kwargs.get("file_index_start")
+    col: int = kwargs.get("col")
+    base_path: str = kwargs.get("base_path")
+
+    damage_scenarios = {}
+    a = file_index_start
+    b = col + 1
+    for i in range(1, num_damage + 1):
+        damage_scenarios[i] = range(a, b)
+        a += col
+        b += col
+
+    # return damage_scenarios
+
+    x = {}
+    for damage, files in damage_scenarios.items():
+        x[damage] = []  # Initialize each key with an empty list
+        for i, file_index in enumerate(files, start=1):
+            if base_path:
+                x[damage].append(
+                    os.path.normpath(
+                        os.path.join(base_path, f"{prefix}{file_index}{extension}")
+                    )
+                )
+                # if not os.path.exists(file_path):
+                #     print(Fore.RED + f"File {file_path} does not exist.")
+                #     continue
+            else:
+                x[damage].append(f"{prefix}{file_index}{extension}")
+    return x
+
+    # file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT")
+    # df = pd.read_csv( file_path, sep="\t", skiprows=10)  # Read with explicit column names
+
+
+class DataProcessor:
+    def __init__(self, file_index: DamageFilesIndices, cache_path: str = None):
+        self.file_index = file_index
+        if cache_path:
+            self.data = load(cache_path)
+        else:
+            self.data = self._load_all_data()
+
+    def _extract_column_names(self, file_path: str) -> List[str]:
+        """
+        Extracts column names from the header of the given file.
+        Assumes the 11th line (index 10 of the header) contains column names.
+
+        :param file_path: Path to the data file.
+        :return: List of column names.
+        """
+        with open(file_path, "r") as f:
+            header_lines = [next(f) for _ in range(12)]
+
+        # Extract column names from the 11th header line (index 10)
+        channel_line = header_lines[10].strip()
+        tokens = re.findall(r'"([^"]+)"', channel_line)
+        if not channel_line.startswith('"'):
+            first_token = channel_line.split()[0]
+            tokens = [first_token] + tokens
+
+        return tokens  # Prepend 'Time' column if applicable
+
+    def _load_dataframe(self, file_path: str) -> OriginalSingleDamageScenario:
+        """
+        Loads a single data file into a pandas DataFrame.
+
+        :param file_path: Path to the data file.
+        :return: DataFrame containing the numerical data.
+        """
+        col_names = self._extract_column_names(file_path)
+        df = pd.read_csv(
+            file_path, delim_whitespace=True, skiprows=11, header=None, memory_map=True
+        )
+        df.columns = col_names
+        return df
+
+    def _load_all_data(self) -> GroupDataset:
+        """
+        Loads all data files based on the grouping dictionary and returns a nested list.
+
+        :return: A nested list of DataFrames where the outer index corresponds to group_idx - 1.
+        """
+        data = []
+        # Find the maximum group index to determine the list size
+        max_group_idx = max(self.file_index.keys()) if self.file_index else 0
+
+        # Initialize empty lists
+        for _ in range(max_group_idx):
+            data.append([])
+
+        # Fill the list with data
+        for group_idx, file_list in self.file_index.items():
+            # Adjust index to be 0-based
+            list_idx = group_idx - 1
+            data[list_idx] = [self._load_dataframe(file) for file in file_list]
+
+        return data
+
+    def get_group_data(self, group_idx: int) -> List[pd.DataFrame]:
+        """
+        Returns the list of DataFrames for the given group index.
+
+        :param group_idx: Index of the group.
+        :return: List of DataFrames.
+        """
+        return self.data.get([group_idx, []])
+
+    def get_column_names(self, group_idx: int, file_idx: int = 0) -> List[str]:
+        """
+        Returns the column names for the given group and file indices.
+
+        :param group_idx: Index of the group.
+        :param file_idx: Index of the file in the group.
+        :return: List of column names.
+        """
+        if group_idx in self.data and len(self.data[group_idx]) > file_idx:
+            return self.data[group_idx][file_idx].columns.tolist()
+        return []
+
+    def get_data_info(self):
+        """
+        Print information about the loaded data structure.
+        Adapted for when self.data is a List instead of a Dictionary.
+        """
+        if isinstance(self.data, list):
+            # For each sublist in self.data, get the type names of all elements
+            pprint(
+                [
+                    (
+                        [type(item).__name__ for item in sublist]
+                        if isinstance(sublist, list)
+                        else type(sublist).__name__
+                    )
+                    for sublist in self.data
+                ]
+            )
+        else:
+            pprint(
+                {
+                    key: [type(df).__name__ for df in value]
+                    for key, value in self.data.items()
+                }
+                if isinstance(self.data, dict)
+                else type(self.data).__name__
+            )
+
+    def _create_vector_column_index(self) -> VectorColumnIndices:
+        vector_col_idx: VectorColumnIndices = []
+        y = 0
+        for data_group in self.data:  # len(data_group[i]) = 5
+            for j in data_group:  # len(j[i]) =
+                c: VectorColumnIndex = []  # column vector c_{j}
+                x = 0
+                for _ in range(6):  # TODO: range(6) should be dynamic and parameterized
+                    c.append(x + y)
+                    x += 5
+                vector_col_idx.append(c)
+                y += 1
+            return vector_col_idx
+
+    def create_vector_column(self, overwrite=True) -> List[List[List[pd.DataFrame]]]:
+        """
+        Create a vector column from the loaded data.
+
+        :param overwrite: Overwrite the original data with vector column-based data.
+        """
+        idx = self._create_vector_column_index()
+        # if overwrite:
+        for i in range(len(self.data)):
+            for j in range(len(self.data[i])):
+                # Get the appropriate indices for slicing from idx
+                indices = idx[j]
+
+                # Get the current DataFrame
+                df = self.data[i][j]
+
+                # Keep the 'Time' column and select only specified 'Real' columns
+                # First, we add 1 to all indices to account for 'Time' being at position 0
+                real_indices = [index + 1 for index in indices]
+
+                # Create list with Time column index (0) and the adjusted Real indices
+                all_indices = [0] + real_indices
+
+                # Apply the slicing
+                self.data[i][j] = df.iloc[:, all_indices]
+        # TODO: if !overwrite:
+
+    def create_limited_sensor_vector_column(self, overwrite=True):
+        """
+        Keep only 'Time' and the first and last 'Real' columns of each vector column.
+
+        :param overwrite: Overwrite the original data with vector column-based data.
+        """
+        idx = self._create_vector_column_index()
+        # if overwrite:
+        for i in range(len(self.data)):  # damage(s)
+            for j in range(len(self.data[i])):  # col(s)
+                # Get the appropriate indices for slicing from idx
+                indices = idx[j]
+
+                # Get the current DataFrame
+                df = self.data[i][j]
+
+                # Keep the 'Time' column and select only specified 'Real' columns
+                # First, we add 1 to all indices to account for 'Time' being at position 0
+                real_indices = [index + 1 for index in indices]
+
+                # Create list with Time column index (0) and the adjusted Real indices
+                all_indices = [0] + [real_indices[0]] + [real_indices[-1]]
+
+                # Apply the slicing
+                self.data[i][j] = df.iloc[:, all_indices]
+        # TODO: if !overwrite:
+
+    def export_to_csv(self, output_dir: str, file_prefix: str = "DAMAGE"):
+        """
+        Export the processed data to CSV files in the required folder structure.
+
+        :param output_dir: Directory to save the CSV files.
+        :param file_prefix: Prefix for the output filenames.
+        """
+        for group_idx, group in enumerate(self.data, start=1):
+            group_folder = os.path.join(output_dir, f"{file_prefix}_{group_idx}")
+            os.makedirs(group_folder, exist_ok=True)
+            for test_idx, df in enumerate(group, start=1):
+                # Ensure columns are named uniquely if duplicated
+                df = df.copy()
+                df.columns = ["Time", "Real_0", "Real_1"]  # Rename
+
+                # Export first Real column
+                out1 = os.path.join(
+                    group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_01.csv"
+                )
+                df[["Time", "Real_0"]].rename(columns={"Real_0": "Real"}).to_csv(
+                    out1, index=False
+                )
+
+                # Export last Real column
+                out2 = os.path.join(
+                    group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_02.csv"
+                )
+                df[["Time", "Real_1"]].rename(columns={"Real_1": "Real"}).to_csv(
+                    out2, index=False
+                )
+

 def create_damage_files(base_path, output_base, prefix):
    # Initialize colorama
    init(autoreset=True)

    # Generate column labels based on expected duplication in input files
-    columns = ['Real'] + [f'Real.{i}' for i in range(1, 30)]  # Explicitly setting column names
+    columns = ["Real"] + [
+        f"Real.{i}" for i in range(1, 30)
+    ]  # Explicitly setting column names

-    sensor_end_map = {1: 'Real.25', 2: 'Real.26', 3: 'Real.27', 4: 'Real.28', 5: 'Real.29'}
+    sensor_end_map = {
+        1: "Real.25",
+        2: "Real.26",
+        3: "Real.27",
+        4: "Real.28",
+        5: "Real.29",
+    }

    # Define the damage scenarios and the corresponding original file indices
    damage_scenarios = {
        1: range(1, 6),  # Damage 1 files from zzzAD1.csv to zzzAD5.csv
        2: range(6, 11),  # Damage 2 files from zzzAD6.csv to zzzAD10.csv
-        3: range(11, 16), # Damage 3 files from zzzAD11.csv to zzzAD15.csvs
-        4: range(16, 21), # Damage 4 files from zzzAD16.csv to zzzAD20.csv
+        3: range(11, 16),  # Damage 3 files from zzzAD11.csv to zzzAD15.csv
+        4: range(16, 21),  # Damage 4 files from zzzAD16.csv to zzzAD20.csv
        5: range(21, 26),  # Damage 5 files from zzzAD21.csv to zzzAD25.csv
-        6: range(26, 31)  # Damage 6 files from zzzAD26.csv to zzzAD30.csv
+        6: range(26, 31),  # Damage 6 files from zzzAD26.csv to zzzAD30.csv
    }
    damage_pad = len(str(len(damage_scenarios)))
    test_pad = len(str(30))
@@ -27,29 +309,36 @@ def create_damage_files(base_path, output_base, prefix):
    for damage, files in damage_scenarios.items():
        for i, file_index in enumerate(files, start=1):
            # Load original data file
-            file_path = os.path.join(base_path, f'zzz{prefix}D{file_index}.TXT')
-            df = pd.read_csv(file_path, sep='\t', skiprows=10)  # Read with explicit column names
+            file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT")
+            df = pd.read_csv(
+                file_path, sep="\t", skiprows=10
+            )  # Read with explicit column names

-            top_sensor = columns[i-1]
+            top_sensor = columns[i - 1]
            print(top_sensor, type(top_sensor))
-            output_file_1 = os.path.join(output_base, f'DAMAGE_{damage}', f'DAMAGE{damage}_TEST{i}_01.csv')
+            output_file_1 = os.path.join(
+                output_base, f"DAMAGE_{damage}", f"DAMAGE{damage}_TEST{i}_01.csv"
+            )
            print(f"Creating {output_file_1} from taking zzz{prefix}D{file_index}.TXT")
            print("Taking datetime column on index 0...")
            print(f"Taking `{top_sensor}`...")
            os.makedirs(os.path.dirname(output_file_1), exist_ok=True)
-            df[['Time', top_sensor]].to_csv(output_file_1, index=False)
+            df[["Time", top_sensor]].to_csv(output_file_1, index=False)
            print(Fore.GREEN + "Done")

            bottom_sensor = sensor_end_map[i]
-            output_file_2 = os.path.join(output_base, f'DAMAGE_{damage}', f'DAMAGE{damage}_TEST{i}_02.csv')
+            output_file_2 = os.path.join(
+                output_base, f"DAMAGE_{damage}", f"DAMAGE{damage}_TEST{i}_02.csv"
+            )
            print(f"Creating {output_file_2} from taking zzz{prefix}D{file_index}.TXT")
            print("Taking datetime column on index 0...")
            print(f"Taking `{bottom_sensor}`...")
            os.makedirs(os.path.dirname(output_file_2), exist_ok=True)
-            df[['Time', bottom_sensor]].to_csv(output_file_2, index=False)
+            df[["Time", bottom_sensor]].to_csv(output_file_2, index=False)
            print(Fore.GREEN + "Done")
            print("---")

+
 def main():
    if len(sys.argv) < 2:
        print("Usage: python convert.py <path_to_csv_files>")
@@ -66,5 +355,6 @@ def main():
    create_damage_files(base_path, output_base, prefix)
    print(Fore.YELLOW + Style.BRIGHT + "All files have been created successfully.")

+
 if __name__ == "__main__":
    main()
--- a/data/QUGS/test.py
+++ b/data/QUGS/test.py
@@ -0,0 +1,25 @@
+from convert import *
+from joblib import dump, load
+
+# a = generate_damage_files_index(
+#     num_damage=6, file_index_start=1, col=5, base_path="D:/thesis/data/dataset_A"
+# )
+
+b = generate_damage_files_index(
+    num_damage=6,
+    file_index_start=1,
+    col=5,
+    base_path="D:/thesis/data/dataset_B",
+    prefix="zzzBD",
+)
+# data_A = DataProcessor(file_index=a)
+# # data.create_vector_column(overwrite=True)
+# data_A.create_limited_sensor_vector_column(overwrite=True)
+# data_A.export_to_csv("D:/thesis/data/converted/raw")
+
+data_B = DataProcessor(file_index=b)
+# data.create_vector_column(overwrite=True)
+data_B.create_limited_sensor_vector_column(overwrite=True)
+data_B.export_to_csv("D:/thesis/data/converted/raw_B")
+# a = load("D:/cache.joblib")
+# breakpoint()
Author	SHA1	Message	Date
nuluh	1511012e11	refactor(test): update test script to generate damage files index for dataset_B and adjust export path for processed data	2025-04-20 16:02:16 +07:00
nuluh	db2947abdf	fix(data): fix the incorrect output of scipy.stft() so that it is stored as a pandas.DataFrame shaped (513, 513), built with the frequencies as the index and the times as the columns and then transposed, instead of only the flattened magnitude; add checks for empty data and correct the file paths for sensor data loading. Closes #43	2025-04-20 14:45:38 +07:00
nuluh	36b36c41ba	feat(data): add export_to_csv method for saving processed data into individuals sensor end and update test script Closes #40	2025-04-17 10:10:19 +07:00
Rifqi D. Panuluh	28681017ad	Merge pull request #39 from nuluh/feature/38-feat-redesign-convertpy Feature/38 feat redesign `convert.py`	2025-03-22 19:57:20 +07:00
nuluh	ff64f3a3ab	refactor(data): update type annotations for damage files index and related classes. Need better implementation	2025-03-22 19:48:50 +07:00
nuluh	58a316d9c8	feat(data): implement damage files index generation and data processing Closes #38	2025-03-21 15:58:50 +07:00