Compare commits

...

11 Commits

Author SHA1 Message Date
nuluh
5b0b3dd4e5 feat(notebook): Add evaluation metrics and confusion matrix visualizations for model predictions on Dataset B. Remove commented-out code and integrate data preparation using create_ready_data function. 2025-04-24 16:13:50 +07:00
nuluh
00d1d55181 docs(README): add instructions for running stft.ipynb notebook 2025-04-24 10:23:31 +07:00
nuluh
8a166e8b11 fix(.gitignore): add rule to ignore egg-info directories and ensure proper formatting 2025-04-24 10:21:28 +07:00
nuluh
eb62c7e614 feat(notebook): Update variable names for clarity, remove unused imports, and streamline data processing. Implement data concatenation using pandas concat for efficiency. Add validation steps for Dataset B and improve model training consistency across sensors. 2025-04-24 10:21:07 +07:00
nuluh
cba4a00cd8 feat(src): implement working function for dataset B to create ready data from STFT files stft_files and add setup.py for package configuration 2025-04-24 09:32:22 +07:00
nuluh
90a5a76609 wip: add function to create stratified train-test split from STFT data 2025-04-23 12:48:15 +07:00
nuluh
c8509aa728 fix(notebooks): fix out of index stft plotting iteration 2025-04-22 10:55:34 +07:00
nuluh
4ebfb52635 Merge branch '40-feat-add-export-to-csv-method-for-dataprocessor-in-convertpy' 2025-04-21 00:16:39 +07:00
nuluh
1511012e11 refactor(test): update test script to generate damage files index for dataset_B and adjust export path for processed data 2025-04-20 16:02:16 +07:00
nuluh
db2947abdf fix(data): fix the incorrect output of scipy.stft() data to be pandas.DataFrame shaped (513,513) along with its frequencies as the index and times as the columns (transposed) instead of just the magnitude that being flattened out; add checks for empty data and correct file paths for sensor data loading.
Closes #43
2025-04-20 14:45:38 +07:00
nuluh
36b36c41ba feat(data): add export_to_csv method for saving processed data into individuals sensor end and update test script
Closes #40
2025-04-17 10:10:19 +07:00
9 changed files with 383 additions and 167 deletions

3
.gitignore vendored
View File

@@ -1,4 +1,5 @@
# Ignore CSV files in the data directory and all its subdirectories # Ignore CSV files in the data directory and all its subdirectories
data/**/*.csv data/**/*.csv
.venv/ .venv/
*.pyc *.pyc
*.egg-info/

View File

@@ -1,3 +1,4 @@
{ {
"python.analysis.extraPaths": ["./code/src/features"] "python.analysis.extraPaths": ["./code/src/features"],
"jupyter.notebookFileRoot": "${workspaceFolder}/code"
} }

View File

@@ -16,3 +16,8 @@ The repository is private and access is restricted only to those who have been g
All contents of this repository, including the thesis idea, code, and associated data, are copyrighted © 2024 by Rifqi Panuluh. Unauthorized use or duplication is prohibited. All contents of this repository, including the thesis idea, code, and associated data, are copyrighted © 2024 by Rifqi Panuluh. Unauthorized use or duplication is prohibited.
[LICENSE](https://github.com/nuluh/thesis?tab=License-1-ov-file#readme) [LICENSE](https://github.com/nuluh/thesis?tab=License-1-ov-file#readme)
## How to Run `stft.ipynb`
1. run `pip install -e .` in root project first
2. run the notebook

View File

@@ -121,8 +121,9 @@
"signal_sensor2_test1 = []\n", "signal_sensor2_test1 = []\n",
"\n", "\n",
"for data in df:\n", "for data in df:\n",
" signal_sensor1_test1.append(data['sensor 1'].values)\n", " if not data.empty and 'sensor 1' in data.columns and 'sensor 2' in data.columns:\n",
" signal_sensor2_test1.append(data['sensor 2'].values)\n", " signal_sensor1_test1.append(data['sensor 1'].values)\n",
" signal_sensor2_test1.append(data['sensor 2'].values)\n",
"\n", "\n",
"print(len(signal_sensor1_test1))\n", "print(len(signal_sensor1_test1))\n",
"print(len(signal_sensor2_test1))" "print(len(signal_sensor2_test1))"
@@ -154,9 +155,7 @@
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np\n", "import numpy as np\n",
"from scipy.signal import stft, hann\n", "from scipy.signal import stft, hann\n",
"from multiprocessing import Pool\n", "# from multiprocessing import Pool\n",
"\n",
"\n",
"\n", "\n",
"# Function to compute and append STFT data\n", "# Function to compute and append STFT data\n",
"def process_stft(args):\n", "def process_stft(args):\n",
@@ -199,23 +198,22 @@
" # Compute STFT\n", " # Compute STFT\n",
" frequencies, times, Zxx = stft(sensor_data, fs=Fs, window=window, nperseg=window_size, noverlap=window_size - hop_size)\n", " frequencies, times, Zxx = stft(sensor_data, fs=Fs, window=window, nperseg=window_size, noverlap=window_size - hop_size)\n",
" magnitude = np.abs(Zxx)\n", " magnitude = np.abs(Zxx)\n",
" flattened_stft = magnitude.flatten()\n", " df_stft = pd.DataFrame(magnitude, index=frequencies, columns=times).T\n",
" df_stft.columns = [f\"Freq_{i}\" for i in frequencies]\n",
" \n", " \n",
" # Define the output CSV file path\n", " # Define the output CSV file path\n",
" stft_file_name = f'stft_data{sensor_num}_{damage_num}.csv'\n", " stft_file_name = f'stft_data{sensor_num}_{damage_num}.csv'\n",
" sensor_output_dir = os.path.join(damage_base_path, sensor_name.lower())\n", " sensor_output_dir = os.path.join(damage_base_path, sensor_name.lower())\n",
" os.makedirs(sensor_output_dir, exist_ok=True)\n", " os.makedirs(sensor_output_dir, exist_ok=True)\n",
" stft_file_path = os.path.join(sensor_output_dir, stft_file_name)\n", " stft_file_path = os.path.join(sensor_output_dir, stft_file_name)\n",
" print(stft_file_path)\n",
" # Append the flattened STFT to the CSV\n", " # Append the flattened STFT to the CSV\n",
" try:\n", " try:\n",
" flattened_stft_df = pd.DataFrame([flattened_stft])\n",
" if not os.path.isfile(stft_file_path):\n", " if not os.path.isfile(stft_file_path):\n",
" # Create a new CSV\n", " # Create a new CSV\n",
" flattened_stft_df.to_csv(stft_file_path, index=False, header=False)\n", " df_stft.to_csv(stft_file_path, index=False, header=False)\n",
" else:\n", " else:\n",
" # Append to existing CSV\n", " # Append to existing CSV\n",
" flattened_stft_df.to_csv(stft_file_path, mode='a', index=False, header=False)\n", " df_stft.to_csv(stft_file_path, mode='a', index=False, header=False)\n",
" print(f\"Appended STFT data to {stft_file_path}\")\n", " print(f\"Appended STFT data to {stft_file_path}\")\n",
" except Exception as e:\n", " except Exception as e:\n",
" print(f\"Error writing to {stft_file_path}: {e}\")" " print(f\"Error writing to {stft_file_path}: {e}\")"
@@ -295,7 +293,7 @@
"\n", "\n",
"# get current y ticks in list\n", "# get current y ticks in list\n",
"print(len(frequencies))\n", "print(len(frequencies))\n",
"print(len(times))\n" "print(len(times))"
] ]
}, },
{ {
@@ -323,10 +321,9 @@
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\n",
"ready_data1 = []\n", "ready_data1a = []\n",
"for file in os.listdir('D:/thesis/data/converted/raw/sensor1'):\n", "for file in os.listdir('D:/thesis/data/converted/raw/sensor1'):\n",
" ready_data1.append(pd.read_csv(os.path.join('D:/thesis/data/converted/raw/sensor1', file)))\n", " ready_data1a.append(pd.read_csv(os.path.join('D:/thesis/data/converted/raw/sensor1', file)))\n",
"ready_data1[0]\n",
"# colormesh give title x is frequency and y is time and rotate/transpose the data\n", "# colormesh give title x is frequency and y is time and rotate/transpose the data\n",
"# Plotting the STFT Data" "# Plotting the STFT Data"
] ]
@@ -337,8 +334,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"ready_data1[1]\n", "len(ready_data1a)\n",
"plt.pcolormesh(ready_data1[1])" "# plt.pcolormesh(ready_data1[0])"
] ]
}, },
{ {
@@ -348,7 +345,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"for i in range(6):\n", "for i in range(6):\n",
" plt.pcolormesh(ready_data1[i])\n", " plt.pcolormesh(ready_data1a[i])\n",
" plt.title(f'STFT Magnitude for case {i} sensor 1')\n", " plt.title(f'STFT Magnitude for case {i} sensor 1')\n",
" plt.xlabel(f'Frequency [Hz]')\n", " plt.xlabel(f'Frequency [Hz]')\n",
" plt.ylabel(f'Time [sec]')\n", " plt.ylabel(f'Time [sec]')\n",
@@ -361,10 +358,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"ready_data2 = []\n", "ready_data2a = []\n",
"for file in os.listdir('D:/thesis/data/converted/raw/sensor2'):\n", "for file in os.listdir('D:/thesis/data/converted/raw/sensor2'):\n",
" ready_data2.append(pd.read_csv(os.path.join('D:/thesis/data/converted/raw/sensor2', file)))\n", " ready_data2a.append(pd.read_csv(os.path.join('D:/thesis/data/converted/raw/sensor2', file)))"
"ready_data2[5]"
] ]
}, },
{ {
@@ -373,8 +369,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"print(len(ready_data1))\n", "print(len(ready_data1a))\n",
"print(len(ready_data2))" "print(len(ready_data2a))"
] ]
}, },
{ {
@@ -383,35 +379,16 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"x1 = 0\n", "x1a = 0\n",
"\n", "print(type(ready_data1a[0]))\n",
"for i in range(len(ready_data1)):\n", "ready_data1a[0].iloc[:,0]"
" print(ready_data1[i].shape)\n",
" x1 = x1 + ready_data1[i].shape[0]\n",
"\n",
"print(x1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x2 = 0\n",
"\n",
"for i in range(len(ready_data2)):\n",
" print(ready_data2[i].shape)\n",
" x2 = x2 + ready_data2[i].shape[0]\n",
"\n",
"print(x2)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Appending" "#### Checking length of the total array"
] ]
}, },
{ {
@@ -420,28 +397,14 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"x1 = ready_data1[0]\n", "x1a = 0\n",
"# print(x1)\n", "print(type(x1a))\n",
"print(type(x1))\n", "for i in range(len(ready_data1a)):\n",
"for i in range(len(ready_data1) - 1):\n", " print(type(ready_data1a[i].shape[0]))\n",
" #print(i)\n", " x1a = x1a + ready_data1a[i].shape[0]\n",
" x1 = np.concatenate((x1, ready_data1[i + 1]), axis=0)\n", " print(type(x1a))\n",
"# print(x1)\n",
"pd.DataFrame(x1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x2 = ready_data2[0]\n",
"\n", "\n",
"for i in range(len(ready_data2) - 1):\n", "print(x1a)"
" #print(i)\n",
" x2 = np.concatenate((x2, ready_data2[i + 1]), axis=0)\n",
"pd.DataFrame(x2)"
] ]
}, },
{ {
@@ -450,15 +413,75 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"print(x1.shape)\n", "x2a = 0\n",
"print(x2.shape)" "\n",
"for i in range(len(ready_data2a)):\n",
" print(ready_data2a[i].shape)\n",
" x2a = x2a + ready_data2a[i].shape[0]\n",
"\n",
"print(x2a)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Labeling" "### Flatten 6 array into one array"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Combine all dataframes in ready_data1a into a single dataframe\n",
"if ready_data1a: # Check if the list is not empty\n",
" # Use pandas concat function instead of iterative concatenation\n",
" combined_data = pd.concat(ready_data1a, axis=0, ignore_index=True)\n",
" \n",
" print(f\"Type of combined data: {type(combined_data)}\")\n",
" print(f\"Shape of combined data: {combined_data.shape}\")\n",
" \n",
" # Display the combined dataframe\n",
" combined_data\n",
"else:\n",
" print(\"No data available in ready_data1a list\")\n",
" combined_data = pd.DataFrame()\n",
"\n",
"# Store the result in x1a for compatibility with subsequent code\n",
"x1a = combined_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Combine all dataframes in ready_data1a into a single dataframe\n",
"if ready_data2a: # Check if the list is not empty\n",
" # Use pandas concat function instead of iterative concatenation\n",
" combined_data = pd.concat(ready_data2a, axis=0, ignore_index=True)\n",
" \n",
" print(f\"Type of combined data: {type(combined_data)}\")\n",
" print(f\"Shape of combined data: {combined_data.shape}\")\n",
" \n",
" # Display the combined dataframe\n",
" combined_data\n",
"else:\n",
" print(\"No data available in ready_data1a list\")\n",
" combined_data = pd.DataFrame()\n",
"\n",
"# Store the result in x1a for compatibility with subsequent code\n",
"x2a = combined_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating the label"
] ]
}, },
{ {
@@ -481,7 +504,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"y_data = [y_1, y_2, y_3, y_4, y_5, y_6]" "y_data = [y_1, y_2, y_3, y_4, y_5, y_6]\n",
"y_data"
] ]
}, },
{ {
@@ -491,7 +515,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"for i in range(len(y_data)):\n", "for i in range(len(y_data)):\n",
" print(ready_data1[i].shape[0])" " print(ready_data1a[i].shape[0])"
] ]
}, },
{ {
@@ -500,19 +524,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import numpy as np\n",
"for i in range(len(y_data)):\n", "for i in range(len(y_data)):\n",
" print(ready_data2[i].shape[0])" " y_data[i] = [y_data[i]]*ready_data1a[i].shape[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in range(len(y_data)):\n",
" y_data[i] = [y_data[i]]*ready_data1[i].shape[0]\n",
" y_data[i] = np.array(y_data[i])"
] ]
}, },
{ {
@@ -522,7 +536,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# len(y_data[0])\n", "# len(y_data[0])\n",
"y_data[0]" "y_data"
] ]
}, },
{ {
@@ -554,10 +568,10 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.model_selection import train_test_split\n", "from src.ml.model_selection import create_ready_data\n",
"\n", "\n",
"x_train1, x_test1, y_train, y_test = train_test_split(x1, y, test_size=0.2, random_state=2)\n", "X1a, y = create_ready_data('D:/thesis/data/converted/raw/sensor1')\n",
"x_train2, x_test2, y_train, y_test = train_test_split(x2, y, test_size=0.2, random_state=2)" "X2a, y = create_ready_data('D:/thesis/data/converted/raw/sensor2')"
] ]
}, },
{ {
@@ -567,6 +581,17 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"\n",
"x_train1, x_test1, y_train, y_test = train_test_split(X1a, y, test_size=0.2, random_state=2)\n",
"x_train2, x_test2, y_train, y_test = train_test_split(X2a, y, test_size=0.2, random_state=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import accuracy_score\n",
"from sklearn.ensemble import RandomForestClassifier, BaggingClassifier\n", "from sklearn.ensemble import RandomForestClassifier, BaggingClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n",
@@ -599,16 +624,17 @@
"\n", "\n",
"\n", "\n",
"# 1. Random Forest\n", "# 1. Random Forest\n",
"rf_model = RandomForestClassifier()\n", "rf_model1 = RandomForestClassifier()\n",
"rf_model.fit(x_train1, y_train)\n", "rf_model1.fit(x_train1, y_train)\n",
"rf_pred1 = rf_model.predict(x_test1)\n", "rf_pred1 = rf_model1.predict(x_test1)\n",
"acc1 = accuracy_score(y_test, rf_pred1) * 100\n", "acc1 = accuracy_score(y_test, rf_pred1) * 100\n",
"accuracies1.append(acc1)\n", "accuracies1.append(acc1)\n",
"# format with color coded if acc1 > 90\n", "# format with color coded if acc1 > 90\n",
"acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n", "acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n",
"print(\"Random Forest Accuracy for sensor 1:\", acc1)\n", "print(\"Random Forest Accuracy for sensor 1:\", acc1)\n",
"rf_model.fit(x_train2, y_train)\n", "rf_model2 = RandomForestClassifier()\n",
"rf_pred2 = rf_model.predict(x_test2)\n", "rf_model2.fit(x_train2, y_train)\n",
"rf_pred2 = rf_model2.predict(x_test2)\n",
"acc2 = accuracy_score(y_test, rf_pred2) * 100\n", "acc2 = accuracy_score(y_test, rf_pred2) * 100\n",
"accuracies2.append(acc2)\n", "accuracies2.append(acc2)\n",
"# format with color coded if acc2 > 90\n", "# format with color coded if acc2 > 90\n",
@@ -618,16 +644,17 @@
"# print(y_test)\n", "# print(y_test)\n",
"\n", "\n",
"# 2. Bagged Trees\n", "# 2. Bagged Trees\n",
"bagged_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)\n", "bagged_model1 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)\n",
"bagged_model.fit(x_train1, y_train)\n", "bagged_model1.fit(x_train1, y_train)\n",
"bagged_pred1 = bagged_model.predict(x_test1)\n", "bagged_pred1 = bagged_model1.predict(x_test1)\n",
"acc1 = accuracy_score(y_test, bagged_pred1) * 100\n", "acc1 = accuracy_score(y_test, bagged_pred1) * 100\n",
"accuracies1.append(acc1)\n", "accuracies1.append(acc1)\n",
"# format with color coded if acc1 > 90\n", "# format with color coded if acc1 > 90\n",
"acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n", "acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n",
"print(\"Bagged Trees Accuracy for sensor 1:\", acc1)\n", "print(\"Bagged Trees Accuracy for sensor 1:\", acc1)\n",
"bagged_model.fit(x_train2, y_train)\n", "bagged_model2 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)\n",
"bagged_pred2 = bagged_model.predict(x_test2)\n", "bagged_model2.fit(x_train2, y_train)\n",
"bagged_pred2 = bagged_model2.predict(x_test2)\n",
"acc2 = accuracy_score(y_test, bagged_pred2) * 100\n", "acc2 = accuracy_score(y_test, bagged_pred2) * 100\n",
"accuracies2.append(acc2)\n", "accuracies2.append(acc2)\n",
"# format with color coded if acc2 > 90\n", "# format with color coded if acc2 > 90\n",
@@ -643,8 +670,9 @@
"# format with color coded if acc1 > 90\n", "# format with color coded if acc1 > 90\n",
"acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n", "acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n",
"print(\"Decision Tree Accuracy for sensor 1:\", acc1)\n", "print(\"Decision Tree Accuracy for sensor 1:\", acc1)\n",
"dt_model.fit(x_train2, y_train)\n", "dt_model2 = DecisionTreeClassifier()\n",
"dt_pred2 = dt_model.predict(x_test2)\n", "dt_model2.fit(x_train2, y_train)\n",
"dt_pred2 = dt_model2.predict(x_test2)\n",
"acc2 = accuracy_score(y_test, dt_pred2) * 100\n", "acc2 = accuracy_score(y_test, dt_pred2) * 100\n",
"accuracies2.append(acc2)\n", "accuracies2.append(acc2)\n",
"# format with color coded if acc2 > 90\n", "# format with color coded if acc2 > 90\n",
@@ -660,8 +688,9 @@
"# format with color coded if acc1 > 90\n", "# format with color coded if acc1 > 90\n",
"acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n", "acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n",
"print(\"KNeighbors Accuracy for sensor 1:\", acc1)\n", "print(\"KNeighbors Accuracy for sensor 1:\", acc1)\n",
"knn_model.fit(x_train2, y_train)\n", "knn_model2 = KNeighborsClassifier()\n",
"knn_pred2 = knn_model.predict(x_test2)\n", "knn_model2.fit(x_train2, y_train)\n",
"knn_pred2 = knn_model2.predict(x_test2)\n",
"acc2 = accuracy_score(y_test, knn_pred2) * 100\n", "acc2 = accuracy_score(y_test, knn_pred2) * 100\n",
"accuracies2.append(acc2)\n", "accuracies2.append(acc2)\n",
"# format with color coded if acc2 > 90\n", "# format with color coded if acc2 > 90\n",
@@ -677,8 +706,9 @@
"# format with color coded if acc1 > 90\n", "# format with color coded if acc1 > 90\n",
"acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n", "acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n",
"print(\"Linear Discriminant Analysis Accuracy for sensor 1:\", acc1)\n", "print(\"Linear Discriminant Analysis Accuracy for sensor 1:\", acc1)\n",
"lda_model.fit(x_train2, y_train)\n", "lda_model2 = LinearDiscriminantAnalysis()\n",
"lda_pred2 = lda_model.predict(x_test2)\n", "lda_model2.fit(x_train2, y_train)\n",
"lda_pred2 = lda_model2.predict(x_test2)\n",
"acc2 = accuracy_score(y_test, lda_pred2) * 100\n", "acc2 = accuracy_score(y_test, lda_pred2) * 100\n",
"accuracies2.append(acc2)\n", "accuracies2.append(acc2)\n",
"# format with color coded if acc2 > 90\n", "# format with color coded if acc2 > 90\n",
@@ -694,8 +724,9 @@
"# format with color coded if acc1 > 90\n", "# format with color coded if acc1 > 90\n",
"acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n", "acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n",
"print(\"Support Vector Machine Accuracy for sensor 1:\", acc1)\n", "print(\"Support Vector Machine Accuracy for sensor 1:\", acc1)\n",
"svm_model.fit(x_train2, y_train)\n", "svm_model2 = SVC()\n",
"svm_pred2 = svm_model.predict(x_test2)\n", "svm_model2.fit(x_train2, y_train)\n",
"svm_pred2 = svm_model2.predict(x_test2)\n",
"acc2 = accuracy_score(y_test, svm_pred2) * 100\n", "acc2 = accuracy_score(y_test, svm_pred2) * 100\n",
"accuracies2.append(acc2)\n", "accuracies2.append(acc2)\n",
"# format with color coded if acc2 > 90\n", "# format with color coded if acc2 > 90\n",
@@ -711,8 +742,9 @@
"# format with color coded if acc1 > 90\n", "# format with color coded if acc1 > 90\n",
"acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n", "acc1 = f\"\\033[92m{acc1:.2f}\\033[00m\" if acc1 > 90 else f\"{acc1:.2f}\"\n",
"print(\"XGBoost Accuracy:\", acc1)\n", "print(\"XGBoost Accuracy:\", acc1)\n",
"xgboost_model.fit(x_train2, y_train)\n", "xgboost_model2 = XGBClassifier()\n",
"xgboost_pred2 = xgboost_model.predict(x_test2)\n", "xgboost_model2.fit(x_train2, y_train)\n",
"xgboost_pred2 = xgboost_model2.predict(x_test2)\n",
"acc2 = accuracy_score(y_test, xgboost_pred2) * 100\n", "acc2 = accuracy_score(y_test, xgboost_pred2) * 100\n",
"accuracies2.append(acc2)\n", "accuracies2.append(acc2)\n",
"# format with color coded if acc2 > 90\n", "# format with color coded if acc2 > 90\n",
@@ -789,57 +821,10 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def spectograph(data_dir: str):\n", "from src.ml.model_selection import create_ready_data\n",
" # print(os.listdir(data_dir))\n",
" for damage in os.listdir(data_dir):\n",
" # print(damage)\n",
" d = os.path.join(data_dir, damage)\n",
" # print(d)\n",
" for file in os.listdir(d):\n",
" # print(file)\n",
" f = os.path.join(d, file)\n",
" print(f)\n",
" # sensor1 = pd.read_csv(f, skiprows=1, sep=';')\n",
" # sensor2 = pd.read_csv(f, skiprows=1, sep=';')\n",
"\n", "\n",
" # df1 = pd.DataFrame()\n", "X1b, y = create_ready_data('D:/thesis/data/converted/raw_B/sensor1')\n",
"\n", "X2b, y = create_ready_data('D:/thesis/data/converted/raw_B/sensor2')"
" # df1['s1'] = sensor1[sensor1.columns[-1]]\n",
" # df1['s2'] = sensor2[sensor2.columns[-1]]\n",
" # # Combined Plot for sensor 1 and sensor 2 from data1 file in which motor is operated at 800 rpm\n",
"\n",
" # plt.plot(df1['s2'], label='sensor 2')\n",
" # plt.plot(df1['s1'], label='sensor 1')\n",
" # plt.xlabel(\"Number of samples\")\n",
" # plt.ylabel(\"Amplitude\")\n",
" # plt.title(\"Raw vibration signal\")\n",
" # plt.legend()\n",
" # plt.show()\n",
"\n",
" # from scipy import signal\n",
" # from scipy.signal.windows import hann\n",
"\n",
" # vibration_data = df1['s1']\n",
"\n",
" # # Applying STFT\n",
" # window_size = 1024\n",
" # hop_size = 512\n",
" # window = hann(window_size) # Creating a Hanning window\n",
" # frequencies, times, Zxx = signal.stft(vibration_data, window=window, nperseg=window_size, noverlap=window_size - hop_size)\n",
"\n",
" # # Plotting the STFT Data\n",
" # plt.pcolormesh(times, frequencies, np.abs(Zxx), shading='gouraud')\n",
" # plt.title(f'STFT Magnitude for case 1 signal sensor 1 ')\n",
" # plt.ylabel('Frequency [Hz]')\n",
" # plt.xlabel('Time [sec]')\n",
" # plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test with Outside of Its Training Data"
] ]
}, },
{ {
@@ -847,7 +832,117 @@
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": [
"y.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, classification_report\n",
"# 4. Validate on Dataset B\n",
"y_pred_svm = svm_model.predict(X1b)\n",
"\n",
"# 5. Evaluate\n",
"print(\"Accuracy on Dataset B:\", accuracy_score(y, y_pred_svm))\n",
"print(classification_report(y, y_pred_svm))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, classification_report\n",
"# 4. Validate on Dataset B\n",
"y_pred = rf_model2.predict(X2b)\n",
"\n",
"# 5. Evaluate\n",
"print(\"Accuracy on Dataset B:\", accuracy_score(y, y_pred))\n",
"print(classification_report(y, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_predict = svm_model2.predict(X2b.iloc[[5312],:])\n",
"print(y_predict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y[5312]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Confusion Matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
"\n",
"\n",
"cm = confusion_matrix(y, y_pred_svm) # -> ndarray\n",
"\n",
"# get the class labels\n",
"labels = svm_model.classes_\n",
"\n",
"# Plot\n",
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)\n",
"disp.plot(cmap=plt.cm.Blues) # You can change colormap\n",
"plt.title(\"SVM Sensor1 CM Train w/ Dataset A Val w/ Dataset B\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Self-test CM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 1. Predict sensor 1 on Dataset A\n",
"y_train_pred = svm_model.predict(x_train1)\n",
"\n",
"# 2. Import confusion matrix tools\n",
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# 3. Create and plot confusion matrix\n",
"cm_train = confusion_matrix(y_train, y_train_pred)\n",
"labels = svm_model.classes_\n",
"\n",
"disp = ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels=labels)\n",
"disp.plot(cmap=plt.cm.Blues)\n",
"plt.title(\"Confusion Matrix: Train & Test on Dataset A\")\n",
"plt.show()\n"
]
} }
], ],
"metadata": { "metadata": {

0
code/src/ml/__init__.py Normal file
View File

View File

@@ -0,0 +1,57 @@
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split as sklearn_split
def create_ready_data(
    stft_data_path: str,
    stratify: np.ndarray = None,
) -> tuple:
    """
    Load STFT CSV files from a directory and build a labeled dataset.

    Each CSV file found in ``stft_data_path`` is treated as one class:
    every row read from the i-th file (in sorted filename order) receives
    the integer label ``i``.

    Parameters
    ----------
    stft_data_path : str
        Path to the directory containing STFT data files
        (e.g. 'data/converted/raw/sensor1').
    stratify : np.ndarray, optional
        Currently unused; kept for backward interface compatibility.
        TODO(review): either wire this into a stratified split or drop it.

    Returns
    -------
    tuple
        (X, y) where X is a pandas DataFrame holding all rows of all files
        concatenated, and y is a 1-D numpy integer array of class labels
        aligned row-for-row with X. Returns an empty DataFrame and an empty
        array when the directory holds no files.
    """
    # Sort the directory listing so the label assigned to each file is
    # deterministic (os.listdir order is arbitrary and platform-dependent).
    frames = [
        pd.read_csv(os.path.join(stft_data_path, name))
        for name in sorted(os.listdir(stft_data_path))
    ]

    if not frames:
        print("No data available in ready_data list")
        return pd.DataFrame(), np.array([])

    # Single concat is linear in total rows, unlike repeated appending.
    X = pd.concat(frames, axis=0, ignore_index=True)
    print(f"Type of combined data: {type(X)}")
    print(f"Shape of combined data: {X.shape}")

    # Label every row of file i with class i; np.full avoids building
    # intermediate Python lists.
    y = np.concatenate(
        [np.full(frame.shape[0], label) for label, frame in enumerate(frames)]
    )
    return X, y

View File

@@ -2,6 +2,7 @@ import pandas as pd
import os import os
import re import re
import sys import sys
import numpy as np
from colorama import Fore, Style, init from colorama import Fore, Style, init
from typing import TypedDict, Dict, List from typing import TypedDict, Dict, List
from joblib import load from joblib import load
@@ -225,25 +226,56 @@ class DataProcessor:
""" """
idx = self._create_vector_column_index() idx = self._create_vector_column_index()
# if overwrite: # if overwrite:
for i in range(len(self.data)): for i in range(len(self.data)): # damage(s)
for j in range(len(self.data[i])): for j in range(len(self.data[i])): # col(s)
# Get the appropriate indices for slicing from idx # Get the appropriate indices for slicing from idx
indices = idx[j] indices = idx[j]
# Get the current DataFrame # Get the current DataFrame
df = self.data[i][j] df = self.data[i][j]
# Keep the 'Time' column and select only specified 'Real' columns # Keep the 'Time' column and select only specified 'Real' columns
# First, we add 1 to all indices to account for 'Time' being at position 0 # First, we add 1 to all indices to account for 'Time' being at position 0
real_indices = [index + 1 for index in indices] real_indices = [index + 1 for index in indices]
# Create list with Time column index (0) and the adjusted Real indices # Create list with Time column index (0) and the adjusted Real indices
all_indices = [0] + [real_indices[0]] + [real_indices[-1]] all_indices = [0] + [real_indices[0]] + [real_indices[-1]]
# Apply the slicing # Apply the slicing
self.data[i][j] = df.iloc[:, all_indices] self.data[i][j] = df.iloc[:, all_indices]
# TODO: if !overwrite: # TODO: if !overwrite:
def export_to_csv(self, output_dir: str, file_prefix: str = "DAMAGE"):
    """
    Export the processed data to CSV files in the required folder structure.

    Writes one folder per damage group (``{file_prefix}_{n}``, 1-based) and,
    inside it, two CSVs per test — one for each sensor column — named
    ``{file_prefix}_{n}_TEST{m}_01.csv`` and ``..._02.csv``, each with the
    columns ``Time`` and ``Real``.

    NOTE(review): assumes every DataFrame in self.data has exactly three
    columns in the order Time, sensor-1, sensor-2 (as produced by
    create_limited_sensor_vector_column) — confirm before reuse.

    :param output_dir: Directory to save the CSV files.
    :param file_prefix: Prefix for the output filenames.
    """
    for group_idx, group in enumerate(self.data, start=1):
        group_folder = os.path.join(output_dir, f"{file_prefix}_{group_idx}")
        # exist_ok so re-exporting over a previous run does not raise.
        os.makedirs(group_folder, exist_ok=True)
        for test_idx, df in enumerate(group, start=1):
            # Ensure columns are named uniquely if duplicated; copy first so
            # the rename does not mutate the DataFrame held in self.data.
            df = df.copy()
            df.columns = ["Time", "Real_0", "Real_1"]  # Rename
            # Export first Real column (sensor 1)
            out1 = os.path.join(
                group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_01.csv"
            )
            df[["Time", "Real_0"]].rename(columns={"Real_0": "Real"}).to_csv(
                out1, index=False
            )
            # Export last Real column (sensor 2)
            out2 = os.path.join(
                group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_02.csv"
            )
            df[["Time", "Real_1"]].rename(columns={"Real_1": "Real"}).to_csv(
                out2, index=False
            )
def create_damage_files(base_path, output_base, prefix): def create_damage_files(base_path, output_base, prefix):
# Initialize colorama # Initialize colorama

View File

@@ -4,5 +4,22 @@ from joblib import dump, load
# a = generate_damage_files_index( # a = generate_damage_files_index(
# num_damage=6, file_index_start=1, col=5, base_path="D:/thesis/data/dataset_A" # num_damage=6, file_index_start=1, col=5, base_path="D:/thesis/data/dataset_A"
# ) # )
# dump(DataProcessor(file_index=a), "D:/cache.joblib")
a = load("D:/cache.joblib") b = generate_damage_files_index(
num_damage=6,
file_index_start=1,
col=5,
base_path="D:/thesis/data/dataset_B",
prefix="zzzBD",
)
# data_A = DataProcessor(file_index=a)
# # data.create_vector_column(overwrite=True)
# data_A.create_limited_sensor_vector_column(overwrite=True)
# data_A.export_to_csv("D:/thesis/data/converted/raw")
data_B = DataProcessor(file_index=b)
# data.create_vector_column(overwrite=True)
data_B.create_limited_sensor_vector_column(overwrite=True)
data_B.export_to_csv("D:/thesis/data/converted/raw_B")
# a = load("D:/cache.joblib")
# breakpoint()

8
setup.py Normal file
View File

@@ -0,0 +1,8 @@
from setuptools import find_packages, setup

# Package sources live under ./code rather than the repository root.
_PACKAGE_ROOT = "code"

setup(
    name="thesisrepo",
    version="0.1",
    package_dir={"": _PACKAGE_ROOT},
    packages=find_packages(where=_PACKAGE_ROOT),
)