From 9e3e234ef65d84ccfc9a6b0719fe9236b26be0a2 Mon Sep 17 00:00:00 2001
From: nuluh
Date: Wed, 15 Oct 2025 13:14:17 +0700
Subject: [PATCH] feat(notebooks): major changes

---
 code/notebooks/stft.ipynb | 1895 +++++++++++++++++++++++++++++++++++--
 1 file changed, 1841 insertions(+), 54 deletions(-)

diff --git a/code/notebooks/stft.ipynb b/code/notebooks/stft.ipynb
index 3e661b7..3e29c43 100644
--- a/code/notebooks/stft.ipynb
+++ b/code/notebooks/stft.ipynb
@@ -370,29 +370,30 @@
 "source": [
 "#### Undamaged case (d0)\n",
 "![Overview of the signal preprocessing for undamaged case](attachment:image.png)\n",
+ "\n",
 "The figure above shows the overview of the signal preprocessing pipeline for undamaged case $(d_0)$. Notice that undamaged case $(d_0)$ is formed from complementary of each `zzzAD{n}` in damage case $(d_1—d_6)$ as explained in [Generate undamaged case index from complementary pairs for each damaged file of Dataset A](#Generate-undamaged-case-index-from-complementary-pairs-for-each-damaged-file-of-Dataset-A).\n",
 "\n",
- "To balance the data for undamaged case $(d_0)$, each produced STFT from $d_0$ ($513 \times 513$) will only be took its first $22$ timeframes for the first 45 $(\verb|{test_num}|\le 45)$ input data and its first $21$ timeframes for the rest of the input data $(\verb|{test_num}|>45)$. This number was determined by solving linear equation \n",
+ "To balance the data for undamaged case $(d_0)$, each produced STFT from $d_0$ ($513 \times 513$) will only take its first $21$ timeframes for the first 65 $(\verb|{test_num}|\le 65)$ input data and its first $20$ timeframes for the rest of the input data $(\verb|{test_num}|>65)$. These numbers were determined by solving the linear system\n",
 "\n",
 "$$\n",
 "\begin{align*}\n",
- "\frac{2565}{120} = 21.375\n",
+ "\frac{2565}{125} = 20.52\n",
 "\begin{cases}\n",
- "21 & \text{or} \\\n",
- "22\n",
+ "20 & \text{or} \\\n",
+ "21\n",
 "\end{cases}\n",
 "\end{align*}\n",
 "$$\n",
 "\n",
 "$$\n",
 "\begin{align*}\n",
- "21x + 22y &= 2565 \\\n",
- "x + y &= 120 \\\n",
+ "20x + 21y &= 2565 \\\n",
+ "x + y &= 125 \\\n",
 "\end{align*}\n",
 "$$\n",
 "\n",
 "$$\n",
- "y = 45, x = 75\n",
+ "x = 60, y = 65\n",
 "$$\n",
 "\n",
 "to achieve same samples number $(2565)$ for all labels $(d_0—d_6)$."
@@ -450,7 +451,7 @@ " plt.yticks(np.linspace(0, len(data.columns)-1, y_num_ticks)) # Set y-ticks at regular intervals\n", " plt.rcParams['svg.fonttype'] = 'none'\n", " plt.rcParams['text.usetex'] = False # use mathtext, not external LaTeX\n", - " plt.savefig(\"output_single.svg\", format=\"svg\", dpi=100)\n", + " plt.savefig(\"output_single.svg\", facecolor='none', format=\"svg\", dpi=100)\n", " plt.show()\n", "\n", " elif type(data) == list and len(data) > 1:\n", @@ -483,7 +484,7 @@ " plt.rcParams['svg.fonttype'] = 'none'\n", " plt.rcParams['text.usetex'] = False # use mathtext, not external LaTeX\n", " mpl.rcParams['text.usetex'] = False # use mathtext, not external LaTeX\n", - " plt.savefig(\"output_multiple.svg\", format=\"svg\", dpi=80)\n", + " plt.savefig(\"output_multiple.svg\", facecolor='none', format=\"svg\", dpi=80)\n", " plt.show()" ] }, @@ -607,11 +608,11 @@ "X1a, y = create_ready_data('D:/thesis/data/converted/raw/sensor1')\n", "\n", "# Display DataFrame\n", - "pd.concat([\n", - " pd.DataFrame(y, columns=['label']), # Labels\n", - " X1a, # Features\n", - " ], \n", - " axis=1)" + "# pd.concat([\n", + "# pd.DataFrame(y, columns=['label']), # Labels\n", + "# X1a, # Features\n", + "# ], \n", + "# axis=1)" ] }, { @@ -623,11 +624,11 @@ "X2a, y = create_ready_data('D:/thesis/data/converted/raw/sensor2')\n", "\n", "# Display DataFrame\n", - "pd.concat([\n", - " pd.DataFrame(y, columns=['label']), # Labels\n", - " X2a, # Features\n", - " ], \n", - " axis=1)" + "# pd.concat([\n", + "# pd.DataFrame(y, columns=['label']), # Labels\n", + "# X2a, # Features\n", + "# ], \n", + "# axis=1)" ] }, { @@ -664,25 +665,143 @@ "print(\"Shape of y2_train:\", y_train1.shape)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Scree plot" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from src.ml.model_selection import train_and_evaluate_model\n", - "from sklearn.svm import SVC\n", - "from 
sklearn.pipeline import make_pipeline\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "from sklearn.svm import SVC\n",
+ "import matplotlib.pyplot as plt\n",
 "from sklearn.decomposition import PCA\n",
- "# from xgboost import XGBClassifier\n",
- "# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier\n",
- "# from sklearn.tree import DecisionTreeClassifier\n",
- "# from sklearn.neighbors import KNeighborsClassifier\n",
- "# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
- "from sklearn.neural_network import MLPClassifier\n",
- "\n"
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "# Assuming X is your dataset\n",
+ "scaler1 = StandardScaler()\n",
+ "scaler2 = StandardScaler()\n",
+ "X_scaled1 = scaler1.fit_transform(x_train1)\n",
+ "X_scaled2 = scaler2.fit_transform(x_train2)\n",
+ "\n",
+ "# Perform PCA\n",
+ "pca1 = PCA()\n",
+ "pca1.fit(X_scaled1)\n",
+ "pca2 = PCA()\n",
+ "pca2.fit(X_scaled2)\n",
+ "\n",
+ "# # Explained variance ratio\n",
+ "# explained_variance_ratio = pca1.explained_variance_ratio_\n",
+ "\n",
+ "# # Create the scree plot\n",
+ "# plt.figure(figsize=(8, 6))\n",
+ "# plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')\n",
+ "# plt.title('Scree Plot')\n",
+ "# plt.xlabel('Principal Component')\n",
+ "# plt.ylabel('Explained Variance Ratio')\n",
+ "# plt.grid()\n",
+ "# plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "# Original data - Cumulative explained variance for first dataset\n",
+ "cumulative_variance = np.cumsum(pca1.explained_variance_ratio_)\n",
+ "n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1\n",
+ "n_components_16 = 16\n",
+ "cumulative_variance_16 = cumulative_variance[n_components_16 - 1]\n",
+ "\n",
+ "# Create figure and primary axis\n",
+ "fig, ax1 = plt.subplots(figsize=(8, 6))\n", + "\n", + "# Plot first dataset on primary axis\n", + "ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', \n", + " label='Dataset 1', color='blue')\n", + "ax1.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')\n", + "ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'n={n_components_95}')\n", + "ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_16:.2f})')\n", + "ax1.axhline(y=cumulative_variance_16, color='b', linestyle='--')\n", + "\n", + "# Set labels and properties for first dataset\n", + "ax1.set_xlabel('Principal Component')\n", + "ax1.set_ylabel('Dataset 1 Variance Ratio', color='blue')\n", + "ax1.tick_params(axis='y', labelcolor='blue')\n", + "ax1.grid(True, alpha=0.3)\n", + "\n", + "# Create secondary y-axis that shares the same x-axis\n", + "ax2 = ax1.twinx()\n", + "\n", + "# Example second dataset (replace with your actual data)\n", + "# For demonstration, I'm creating synthetic data with a different scale\n", + "second_dataset = np.sqrt(cumulative_variance) # Just an example - replace with your actual data\n", + "\n", + "# Plot second dataset on secondary axis\n", + "ax2.plot(range(1, len(second_dataset) + 1), second_dataset, marker='s', linestyle='-', \n", + " color='red', label='Dataset 2')\n", + "\n", + "# Set properties for second dataset\n", + "ax2.set_ylabel('Dataset 2 Variance Ratio', color='red')\n", + "ax2.tick_params(axis='y', labelcolor='red')\n", + "\n", + "# Create combined legend\n", + "lines1, labels1 = ax1.get_legend_handles_labels()\n", + "lines2, labels2 = ax2.get_legend_handles_labels()\n", + "ax1.legend(lines1 + lines2, labels1 + labels2, loc='best')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the stability of PCA" + ] + }, + { + "cell_type": "code", + 
"execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.decomposition import PCA\n",
+ "from sklearn.utils import resample\n",
+ "import numpy as np\n",
+ "\n",
+ "angles = []\n",
+ "\n",
+ "for i in range(100): # bootstrap 100 times\n",
+ " Xb = resample(X1a)\n",
+ " pca = PCA().fit(Xb)\n",
+ " v1 = pca.components_[0] # first eigenvector\n",
+ " if i > 0:\n",
+ " angle = np.abs(np.dot(v1, prev_v1)) # cosine similarity\n",
+ " angles.append(angle)\n",
+ " prev_v1 = v1\n",
+ "\n",
+ "np.mean(angles) # close to 1 = stable; lower = unstable"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sklearnex\n",
+ "sklearnex.patch_sklearn()\n"
+ ]
+ },
 {
 "cell_type": "code",
@@ -698,28 +817,448 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "# Define models for sensor1\n",
- "models_sensor1 = {\n",
+ "from src.ml.model_selection import train_and_evaluate_model\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.decomposition import PCA\n",
+ "# from xgboost import XGBClassifier\n",
+ "# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier\n",
+ "# from sklearn.tree import DecisionTreeClassifier\n",
+ "# from sklearn.neighbors import KNeighborsClassifier\n",
+ "# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
+ "# from sklearn.neural_network import MLPClassifier\n",
+ "\n",
+ "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n",
+ "\n"
plot_tsne(pca, y, n, save_to):\n", + " tsne = TSNE(n_components=2, perplexity=70, learning_rate=200, random_state=10)\n", + " X_tsne = tsne.fit_transform(pca)\n", + " # set size\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " # add color bar\n", + " # remove plot area lines\n", + " plt.gca().spines['top'].set_visible(False)\n", + " plt.gca().spines['right'].set_visible(False)\n", + " plt.gca().spines['bottom'].set_visible(False)\n", + " plt.gca().spines['left'].set_visible(False)\n", + "\n", + " # remove ticks\n", + " plt.xticks([])\n", + " plt.yticks([])\n", + "\n", + " # Example: X_tsne is (n_samples, 2) array; y_train1 is class labels (0, 1, 2, ...)\n", + " # X_tsne, y_train1 = ...\n", + "\n", + " # Choose a color map (tab10 is good for up to 10 classes)\n", + " cmap = plt.get_cmap('tab10')\n", + "\n", + " # Get the unique class labels\n", + " classes = np.unique(y)\n", + "\n", + " # Create the scatter plot, one class at a time\n", + " for i, cls in enumerate(classes):\n", + " plt.scatter(\n", + " X_tsne[y == cls, 0],\n", + " X_tsne[y == cls, 1],\n", + " color=cmap(i),\n", + " label=str(cls),\n", + " alpha=0.6\n", + " )\n", + "\n", + " # Add title and legend\n", + " # plt.title(\"t-SNE visualization with class labels\")\n", + " plt.legend(title=\"Classes\")\n", + " if save_to:\n", + " if not os.path.exists(save_to):\n", + " os.makedirs(save_to)\n", + " plt.savefig(f\"{save_to}/tsne_pca{n}.png\", dpi=300)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pacmap\n", + "import os\n", + "\n", + "def plot_pacmap(pca, y, n, save_to):\n", + " embedding = pacmap.PaCMAP(n_components=2, n_neighbors=20, MN_ratio=1, FP_ratio=2.0, random_state=10)\n", + " X_pacmap = embedding.fit_transform(pca)\n", + "\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " # Choose a color map (tab10 is good for up to 10 classes)\n", + " cmap = 
plt.get_cmap('tab10')\n", + "\n", + " # Get the unique class labels\n", + " classes = np.unique(y)\n", + "\n", + " # Create the scatter plot, one class at a time\n", + " for i, cls in enumerate(classes):\n", + " plt.scatter(\n", + " X_pacmap[y == cls, 0],\n", + " X_pacmap[y == cls, 1],\n", + " color=cmap(i),\n", + " label=str(cls),\n", + " alpha=0.6\n", + " )\n", + " # legend\n", + " plt.gca().spines['top'].set_visible(False)\n", + " plt.gca().spines['right'].set_visible(False)\n", + " plt.gca().spines['bottom'].set_visible(False)\n", + " plt.gca().spines['left'].set_visible(False)\n", + " # remove ticks\n", + " plt.xticks([])\n", + " plt.yticks([])\n", + " plt.tight_layout()\n", + " plt.legend()\n", + " if save_to:\n", + " if not os.path.exists(save_to):\n", + " os.makedirs(save_to)\n", + " plt.savefig(f\"{save_to}/pacmap_pca{n}.png\", dpi=300)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(x_train2) \n", + "for n in [512]:\n", + " # pca = PCA(n_components=n).fit_transform(X_scaled)\n", + " pca = X_scaled\n", + " plot_tsne(pca, y_train2, n, save_to=\"D:/thesis/figures/Sensor B\")\n", + " plot_pacmap(pca, y_train2, n, save_to=\"D:/thesis/figures/Sensor B\")\n", + "# plt.plot(np.cumsum(pca.explained_variance_ratio_))\n", + "# plt.xlabel('number of components')\n", + "# plt.ylabel('cumulative explained variance')\n", + "# # set y ticks with step 0.05\n", + "# plt.yticks(np.arange(0, 1.05, 0.05))\n", + "# plt.grid()\n", + "# # show number of components where cumulative explained variance = 0.95\n", + "# n_95 = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1\n", + "# plt.axvline(n_95, color='r', linestyle='--')\n", + "# plt.text(n_95+2, 0.5, f'n={n_95}', color='r')\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca = PCA(n_components=0.95).fit(X_scaled)\n", + "pca.components_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loadings = pd.DataFrame(pca.components_.T,\n", + " columns=[f\"PC{i+1}\" for i in range(pca.n_components_)],\n", + " index=X1a.columns)\n", + "loadings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca.explained_variance_ratio_[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "top = loadings.iloc[:, 0].abs().nlargest(5)\n", + "loadings.loc[top.index, :].abs()*loadings.loc[top.index, :].apply(np.sign)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show only label 0 of the tsne plot\n", + "scatter = plt.scatter(X_tsne[y_train1 == 0, 0], X_tsne[y_train1 == 0, 1], c='red', label='Label 0')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show only the one with y == 0\n", + "# use tab10[0] color\n", + "# make it blend with add filter\n", + "\n", + "from sklearn.cluster import DBSCAN\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patheffects as patheffects\n", + "import matplotlib.patches as mpatches\n", + "\n", + "def cluster_per_label(embedding, labels, targets, *, eps=0.8, min_samples=30):\n", + " summary = {}\n", + " plt.figure(figsize=(8, 5))\n", + " cmap = plt.get_cmap(\"tab10\")\n", + " legend_handles = []\n", + " for target, _ in targets:\n", + " subset = embedding[labels == target]\n", + " if subset.size == 0:\n", + " summary[target] = {}\n", + " continue\n", + "\n", + " model = DBSCAN(eps=eps, min_samples=min_samples).fit(subset)\n", + " cluster_ids = model.labels_\n", + 
"\n", + " valid_clusters = [cid for cid in np.unique(cluster_ids) if cid != -1]\n", + " counts = {cid: int(np.sum(cluster_ids == cid)) for cid in valid_clusters}\n", + " summary[target] = counts\n", + "\n", + " label_color = cmap(target % cmap.N)\n", + "\n", + " colors = np.array([\n", + " label_color if cid != -1 else (0.8, 0.8, 0.8, 0.3)\n", + " for cid in cluster_ids\n", + " ])\n", + "\n", + " scatter = plt.scatter(\n", + " subset[:, 0],\n", + " subset[:, 1],\n", + " c=colors,\n", + " alpha=0.01\n", + " )\n", + " legend_handles.append(\n", + " mpatches.Patch(color=label_color, label=f\"Damage {target} (clusters={len(valid_clusters)})\")\n", + " )\n", + " for cid in valid_clusters:\n", + " cluster_pts = subset[cluster_ids == cid]\n", + " center = cluster_pts.mean(axis=0)\n", + " txt = plt.text(\n", + " center[0],\n", + " center[1],\n", + " f\"{counts[cid]}\",\n", + " color=label_color,\n", + " fontsize=9,\n", + " fontweight=\"bold\",\n", + " ha=\"left\",\n", + " va=\"bottom\",\n", + " )\n", + " txt.set_path_effects([\n", + " patheffects.withStroke(linewidth=1.8, foreground=\"white\")\n", + " ])\n", + " plt.xlabel(\"PaCMAP-1\")\n", + " plt.ylabel(\"PaCMAP-2\")\n", + " plt.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " plt.tight_layout()\n", + " plt.figure(dpi=600)\n", + " # move legends outside the plot\n", + " plt.show()\n", + " return summary\n", + "\n", + "cluster_counts = cluster_per_label(\n", + " X_pacmap,\n", + " y_train1,\n", + " targets=[(0,0),(1,1), (2,2), (3,3), (4,4), (5,5), (6,6)],\n", + " eps=0.8,\n", + " min_samples=30,\n", + ")\n", + "\n", + "for label_id, counts in cluster_counts.items():\n", + " print(f\"Damage {label_id}\")\n", + " if counts:\n", + " for cluster_id, n_points in counts.items():\n", + " print(f\" Cluster {cluster_id}: {n_points} points\")\n", + " else:\n", + " print(\" No clusters detected (all noise)\")\n", + "\n", + "print(cluster_counts)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(X1a) \n", + "\n", + "pca = PCA(n_components=50).fit_transform(X_scaled)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0, random_state=10)\n", + "X_pacmap = embedding.fit_transform(pca)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.cluster import KMeans\n", + "import numpy as np\n", + "from collections import Counter\n", + "\n", + "# Suppose X_embedded is your 2D PaCMAP / t-SNE / UMAP embedding\n", + "kmeans = KMeans(n_clusters=40, random_state=42) # choose k\n", + "clusters = kmeans.fit_predict(X_pacmap)\n", + "\n", + "counts = Counter(clusters)\n", + "print(counts)\n", + "\n", + "plt.figure(figsize=(10,8))\n", + "sc = plt.scatter(X_pacmap[:, 0], X_pacmap[:, 1], c=clusters, cmap=\"tab20\", s=10)\n", + "cbar = plt.colorbar(sc, ticks=range(kmeans.n_clusters))\n", + "# Label clusters with counts at their centroids\n", + "centers = kmeans.cluster_centers_\n", + "for i, (x, y) in enumerate(centers):\n", + " plt.text(x, y, str(counts[i]), fontsize=12, weight='bold',\n", + " ha='left', va='bottom', color='black',\n", + " bbox=dict(facecolor='white', alpha=0, edgecolor='none'))\n", + "\n", + "plt.title(\"Clusters with counts\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add Parameters Grid" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Coarse Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = [\n", + " { \n", + " \"svc\": [SVC()],\n", + " \"svc__kernel\": [\"rbf\"],\n", + " \"svc__C\": 
np.exp2(np.arange(-5, 16, 5)),\n", + " \"svc__gamma\": np.exp2(np.arange(-15, 6, 5)),\n", + " \"pca__n_components\": [512, 256, 128, 64, 32, 16, 8, 4]\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define models for sensor\n", + "models_sensor = {\n", " # \"Random Forest\": RandomForestClassifier(),\n", " # \"Bagged Trees\": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10),\n", " # \"Decision Tree\": DecisionTreeClassifier(),\n", " # \"KNN\": KNeighborsClassifier(),\n", " # \"LDA\": LinearDiscriminantAnalysis(),\n", " # \"SVM\": make_pipeline(\n", - " # StandardScaler(),\n", - " # SVC(kernel='rbf', probability=True)\n", + " # SVC(kernel='rbf')\n", " # ),\n", - " \"SVM with StandardScaler and PCA\": make_pipeline(\n", - " StandardScaler(),\n", - " PCA(n_components=10),\n", - " SVC(kernel='rbf', probability=True)\n", - " ),\n", + " # \"Grid SVM\": GridSearchCV(\n", + " # Pipeline([\n", + " # (\"scaler\", StandardScaler()),\n", + " # (\"svc\", SVC())\n", + " # ]),\n", + " # cv=cv,\n", + " # param_grid=param_grid,\n", + " # n_jobs=-1,\n", + " # scoring='accuracy',\n", + " # verbose=2\n", + " # ),\n", + "\n", + " \"Grid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA()),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv,\n", + " param_grid=param_grid,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", "\n", " # \"XGBoost\": XGBClassifier()\n", - " \"MLPClassifier\": make_pipeline(\n", - " StandardScaler(),\n", - " MLPClassifier(hidden_layer_sizes=(1, 10), max_iter=500, random_state=42)\n", - " )\n", + " # \"MLPClassifier\": make_pipeline(\n", + " # StandardScaler(),\n", + " # 
MLPClassifier(hidden_layer_sizes=(1, 10), max_iter=500, random_state=42)\n", + " # )\n", "}" ] }, @@ -730,7 +1269,7 @@ "outputs": [], "source": [ "results_sensor1 = []\n", - "for name, model in models_sensor1.items():\n", + "for name, model in models_sensor.items():\n", " res = train_and_evaluate_model(model, name, \"Sensor A\", x_train1, y_train1, x_test1, y_test1, \n", " export='D:/thesis/models/Sensor A')\n", " results_sensor1.append(res)\n", @@ -740,6 +1279,1187 @@ "results_sensor1" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_sensor2 = []\n", + "for name, model in models_sensor.items():\n", + " res = train_and_evaluate_model(model, name, \"Sensor B\", x_train2, y_train2, x_test2, y_test2, \n", + " export='D:/thesis/models/Sensor B')\n", + " results_sensor2.append(res)\n", + " print(f\"{name} on sensor2: Accuracy = {res['accuracy']:.2f}%\")\n", + "\n", + "# Display result\n", + "results_sensor2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import load\n", + "from sklearn.model_selection import GridSearchCV\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "model: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n", + "df = model.cv_results_\n", + "df = pd.DataFrame(df)\n", + "df\n", + "# change column \"param_svc__C\" to np.log2\n", + "# df['param_svc__C'] = np.log2(df['param_svc__C'])\n", + "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma'])\n", + "\n", + "# for i in df['param_pca__n_components'].unique():\n", + "# # show the other columns where mean_test_score is max\n", + "# idx = df[df['param_pca__n_components'] == i]['mean_test_score'].idxmax()\n", + "\t\n", + "# print(i, \":\", df.iloc[idx][\"param_svc__C\"], df.iloc[idx][\"param_svc__gamma\"], df.iloc[idx][\"mean_test_score\"], df.iloc[idx][\"mean_fit_time\"], 
df.iloc[idx][\"mean_score_time\"])\n", + "\n", + "# Get rows where param_pca__n_components is 32\n", + "# result = df[df['param_pca__n_components'] == 32]\n", + "# top 10 most fit time\n", + "result.nlargest(10, 'mean_test_score')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot contour plot of mean_test_score with x=param_svc__C and y=param_svc__gamma for each param_pca__n_components\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import seaborn as sns\n", + "norm = plt.Normalize(vmin=0, vmax=1)\n", + "for i in df['param_pca__n_components'].unique():\n", + " subset = df[df['param_pca__n_components'] == i]\n", + " pivot_table = subset.pivot(index='param_svc__gamma', columns='param_svc__C', values='mean_test_score')\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm)\n", + " plt.xlabel('C (log2 scale)')\n", + " plt.ylabel('Gamma (log2 scale)')\n", + " plt.xticks(ticks=np.arange(len(pivot_table.columns)) + 0.5, labels=[f\"{np.log2(c):.0f}\" for c in pivot_table.columns])\n", + " plt.yticks(ticks=np.arange(len(pivot_table.index)) + 0.5, labels=[f\"{np.log2(g):.0f}\" for g in pivot_table.index])\n", + "\n", + " # Add outline for the heatmap cells\n", + " for (j, k), val in np.ndenumerate(pivot_table.values):\n", + " plt.gca().add_patch(plt.Rectangle((j, k), 1, 1, fill=False, edgecolor='black', lw=0.5))\n", + "\n", + " # Highlight the cell with the max mean_test_score\n", + " max_idx = subset['mean_test_score'].idxmax()\n", + " max_C = subset.loc[max_idx, 'param_svc__C']\n", + " max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n", + " max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n", + " max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n", + " 
plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=4))\n", + "\n", + " # Save the figure for the current PCA component\n", + " print(i)\n", + " plt.savefig(f'D:/thesis/figures/Sensor B/grid_pca{i}.png', dpi=300)\n", + " print(f\"Saved figure: D:/thesis/figures/Sensor B/grid_pca{i}.png\")\n", + " plt.close() # Close the figure to avoid overlapping figures in the next iteration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Efficiency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# make param c and gamma to be log2 scale\n", + "df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n", + "df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n", + "# daata frame to include only highest mean_test_score for each param_pca__n_components\n", + "best_results = df.loc[df.groupby('param_pca__n_components')['mean_test_score'].idxmax()][['param_pca__n_components', 'param_svc__C', 'param_svc__gamma', 'mean_test_score', 'mean_fit_time']]\n", + "# add ratio of mean test score to mean fit time return in 10^-3\n", + "best_results['time_score_ratio'] = best_results['mean_test_score'] / best_results['mean_fit_time'] * 1e3\n", + "print(best_results.to_latex(float_format=\"%.5f\", index=False))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.lineplot(x=\"param_svc__C\", y=\"mean_test_score\", hue=\"param_svc__kernel\", data=df, marker=\"o\")\n", + "plt.xscale(\"log\", base=2)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"max_diff\"] = df[[\"split0_test_score\",\"split1_test_score\",\n", + " \"split2_test_score\",\"split3_test_score\",\"split4_test_score\"]].max(axis=1) - \\\n", + 
" df[[\"split0_test_score\",\"split1_test_score\",\n", + " \"split2_test_score\",\"split3_test_score\",\"split4_test_score\"]].min(axis=1)\n", + "\n", + "sns.scatterplot(x=\"mean_test_score\", y=\"max_diff\", hue=\"param_svc__kernel\", data=df)\n", + "plt.title(\"Stability vs Score\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# ---------------------------\n", + "# Prep\n", + "# ---------------------------\n", + "def _prep(cv_df: pd.DataFrame):\n", + " df = cv_df.copy()\n", + " df = df.rename(columns={\n", + " \"param_pca__n_components\": \"n\",\n", + " \"param_svc__kernel\": \"kernel\",\n", + " \"param_svc__C\": \"C\",\n", + " \"param_svc__gamma\": \"gamma\",\n", + " \"mean_test_score\": \"mean\",\n", + " \"std_test_score\": \"std\",\n", + " \"mean_fit_time\": \"fit_time\",\n", + " \"mean_score_time\": \"score_time\"\n", + " })\n", + " # Numeric coercion (robust to object dtypes)\n", + " for col in [\"C\", \"gamma\", \"mean\", \"std\", \"fit_time\", \"score_time\", \"n\"]:\n", + " if col in df.columns:\n", + " df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n", + " # Count CV splits from split* columns\n", + " split_cols = [c for c in df.columns if c.startswith(\"split\") and c.endswith(\"_test_score\")]\n", + " n_splits = max(1, len(split_cols))\n", + " # Derived metrics\n", + " df[\"se\"] = df[\"std\"] / np.sqrt(n_splits)\n", + " df[\"total_time\"] = df[\"fit_time\"] + df[\"score_time\"]\n", + " df[\"lcb\"] = df[\"mean\"] - 1.96 * df[\"se\"] # 95% lower confidence bound\n", + " return df, n_splits\n", + "\n", + "# ---------------------------\n", + "# Pareto mask (maximize 'score_col', minimize 'time_col')\n", + "# ---------------------------\n", + "def _pareto_mask(d: pd.DataFrame, score_col=\"lcb\", time_col=\"total_time\") -> pd.Series:\n", + " # A point is NON-dominated if no other point is <= time and 
>= score\n", + " pts = d[[time_col, score_col]].to_numpy()\n", + " n = len(d)\n", + " is_nd = np.ones(n, dtype=bool)\n", + " for i in range(n):\n", + " if not is_nd[i]:\n", + " continue\n", + " t_i, s_i = pts[i]\n", + " dominated_by_any = (\n", + " ((pts[:,0] <= t_i) & (pts[:,1] >= s_i)) &\n", + " ((pts[:,0] < t_i) | (pts[:,1] > s_i))\n", + " )\n", + " dominated_by_any[i] = False # ignore self\n", + " if np.any(dominated_by_any):\n", + " is_nd[i] = False\n", + " return pd.Series(is_nd, index=d.index)\n", + "\n", + "# ---------------------------\n", + "# Boundary (edge-of-grid) check\n", + "# ---------------------------\n", + "def _edge_flags(row, uniques):\n", + " def edge(val, arr):\n", + " arr = np.asarray(sorted(x for x in arr if pd.notnull(x)))\n", + " if len(arr) == 0 or pd.isnull(val):\n", + " return False\n", + " return np.isclose(val, arr.min()) or np.isclose(val, arr.max())\n", + " return {\n", + " \"edge_C\": edge(row[\"C\"], uniques.get(\"C\", [])),\n", + " \"edge_gamma\": edge(row[\"gamma\"], uniques.get(\"gamma\", [])),\n", + " \"edge_n\": edge(row[\"n\"], uniques.get(\"n\", [])),\n", + " }\n", + "\n", + "# ---------------------------\n", + "# Main analyzer\n", + "# ---------------------------\n", + "def analyze_efficiency(\n", + " cv_df: pd.DataFrame,\n", + " w_fit: float = 1.0,\n", + " w_score: float = 1.0,\n", + " use_lcb: bool = True\n", + "):\n", + " \"\"\"\n", + " Returns:\n", + " summary_by_n: DataFrame (one picked row per n with diagnostics)\n", + " pareto_by_n: dict[n] -> DataFrame of Pareto frontier candidates\n", + " picks_by_n: dict[n] -> Series (the picked row)\n", + " global_reco: Series (best overall by efficiency)\n", + " \"\"\"\n", + " df, n_splits = _prep(cv_df)\n", + "\n", + " # Allow custom weighting of fit vs score time (e.g., inference matters more)\n", + " df[\"total_time\"] = w_fit * df[\"fit_time\"] + w_score * df[\"score_time\"]\n", + " score_col = \"lcb\" if use_lcb else \"mean\"\n", + "\n", + " # Cache unique grids for 
edge-of-grid flags\n", + " uniques = {\n", + " \"C\": df[\"C\"].unique(),\n", + " \"gamma\": df[\"gamma\"].unique(),\n", + " \"n\": df[\"n\"].unique()\n", + " }\n", + "\n", + " pareto_by_n, picks_by_n, rows = {}, {}, []\n", + "\n", + " for n, g in df.groupby(\"n\"):\n", + " g = g.dropna(subset=[\"total_time\", score_col]).copy()\n", + " if g.empty:\n", + " continue\n", + "\n", + " # Pareto frontier within this n\n", + " mask = _pareto_mask(g, score_col=score_col, time_col=\"total_time\")\n", + " pareto = g.loc[mask].copy()\n", + " pareto[\"efficiency\"] = pareto[score_col] / pareto[\"total_time\"]\n", + "\n", + " # Choose the efficiency champion at this n\n", + " pick = pareto.loc[pareto[\"efficiency\"].idxmax()].copy()\n", + "\n", + " # Compare versus best-accuracy-at-n (not risk-adjusted)\n", + " best_acc_row = g.loc[g[\"mean\"].idxmax()].copy()\n", + " acc_loss_vs_best = best_acc_row[\"mean\"] - pick[\"mean\"] # >= 0 means pick is slightly worse in accuracy\n", + " speedup_vs_best = best_acc_row[\"total_time\"] / pick[\"total_time\"] # >1 means pick is faster\n", + "\n", + " # Edge-of-grid diagnostics\n", + " flags = _edge_flags(pick, uniques)\n", + "\n", + " # Pack summary row\n", + " rows.append({\n", + " \"n\": n,\n", + " \"kernel\": pick.get(\"kernel\"),\n", + " \"C\": pick.get(\"C\"),\n", + " \"gamma\": pick.get(\"gamma\"),\n", + " \"mean_score\": pick[\"mean\"],\n", + " \"std_score\": pick[\"std\"],\n", + " \"se_score\": pick[\"se\"],\n", + " \"LCB_score\": pick[\"lcb\"],\n", + " \"fit_time_s\": pick[\"fit_time\"],\n", + " \"score_time_s\": pick[\"score_time\"],\n", + " \"total_time_s\": pick[\"total_time\"],\n", + " \"efficiency\": pick[\"efficiency\"], # (LCB or mean)/sec\n", + " \"acc_loss_vs_best_at_n\": acc_loss_vs_best,\n", + " \"speedup_vs_best_at_n\": speedup_vs_best,\n", + " \"pareto_size_at_n\": len(pareto),\n", + " \"edge_C\": flags[\"edge_C\"],\n", + " \"edge_gamma\": flags[\"edge_gamma\"],\n", + " \"edge_n\": flags[\"edge_n\"],\n", + " })\n", 
+ "\n", + " pareto_by_n[n] = (pareto\n", + " .sort_values([\"total_time\", score_col], ascending=[True, False])\n", + " .reset_index(drop=True))\n", + " picks_by_n[n] = pick\n", + "\n", + " summary_by_n = pd.DataFrame(rows).sort_values(\"efficiency\", ascending=False).reset_index(drop=True)\n", + "\n", + " # Best overall by efficiency\n", + " global_reco = None\n", + " if not summary_by_n.empty:\n", + " global_reco = summary_by_n.loc[summary_by_n[\"efficiency\"].idxmax()]\n", + "\n", + " return summary_by_n, pareto_by_n, picks_by_n, global_reco\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# summary_by_n: one recommended (most efficient) config per PCA n_components\n", + "# pareto_by_n[n]: the Pareto frontier at that n (all strong time/accuracy trade-offs)\n", + "summary_by_n, pareto_by_n, picks_by_n, global_reco = analyze_efficiency(df)\n", + "\n", + "print(\"Top efficiency picks per n_components:\")\n", + "display(summary_by_n.head(20)) # or print(summary_by_n.to_string(index=False))\n", + "\n", + "print(\"\\nMost efficient configuration overall:\")\n", + "print(global_reco.to_string())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_time_vs_score_for_n(cv_df, n, use_lcb=True):\n", + " df, _ = _prep(cv_df)\n", + " score_col = \"lcb\" if use_lcb else \"mean\"\n", + " df[\"total_time\"] = df[\"fit_time\"] + df[\"score_time\"]\n", + "\n", + " g = df[df[\"n\"] == n].dropna(subset=[\"total_time\", score_col]).copy()\n", + " if g.empty:\n", + " print(f\"No rows for n={n}\")\n", + " return\n", + "\n", + " mask = _pareto_mask(g, score_col=score_col, time_col=\"total_time\")\n", + " pareto = g.loc[mask].copy()\n", + "\n", + " plt.figure(figsize=(7, 4.5))\n", + " plt.scatter(g[\"total_time\"], g[score_col], alpha=0.3, s=16)\n", + " p_sorted = pareto.sort_values([\"total_time\", 
score_col], ascending=[True, False])\n", + " plt.plot(p_sorted[\"total_time\"], p_sorted[score_col], marker=\"o\")\n", + " plt.xlabel(\"Total time (fit + score, s)\")\n", + " plt.ylabel(\"CV score\" + (\" (LCB)\" if use_lcb else \" (mean)\"))\n", + " plt.title(f\"Time vs Score with Pareto frontier — n={n}\")\n", + " plt.grid(True, alpha=0.3)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + "# Example usage:\n", + "plot_time_vs_score_for_n(df, n=128, use_lcb=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(df.iloc[np.argmax(df['mean_fit_time'])]['params']) # Check the highest mean fit time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import matplotlib as mpl\n", + "\n", + "# Pivot table for contour plot with log2 transformation\n", + "pivot = df.pivot(\n", + " index='param_svc__C', \n", + " columns='param_svc__gamma', \n", + " values=\"mean_test_score\"\n", + ")\n", + "\n", + "# Create new log2-transformed indices and columns\n", + "log2_columns = np.log2(pivot.columns)\n", + "log2_indices = np.log2(pivot.index)\n", + "\n", + "\n", + "# Create a contour plot using log2-transformed data\n", + "plt.figure(figsize=(8, 6))\n", + "X, Y = np.meshgrid(log2_columns, log2_indices)\n", + "Z = pivot.values\n", + "\n", + "levels = np.linspace(Z.min(), Z.max(), 10) # Adjust the number of levels as needed\n", + "levels = [0.9, 0.92, 0.96, 0.98, 0.99]\n", + "# Create filled contours\n", + "# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n", + "# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n", + "\n", + "# Add contour lines\n", + "contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n", + "plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n", + "\n", + 
"# Set axis labels and title\n", + "plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n", + "plt.xlabel(\"log₂(gamma)\")\n", + "plt.ylabel(\"log₂(C)\")\n", + "\n", + "# Since we're already using log2 values, no need to transform tick labels\n", + "plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n", + "plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n", + "\n", + "print(plt.gca().get_yticklabels())\n", + "print(plt.gca().get_xticklabels())\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Evaluation Baseline on Dataset B" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from src.ml.model_selection import create_ready_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X2a, y = create_ready_data('D:/thesis/data/converted/raw/sensor2')\n", + "\n", + "# Display DataFrame\n", + "# pd.concat([\n", + "# pd.DataFrame(y, columns=['label']), # Labels\n", + "# X2a, # Features\n", + "# ], \n", + "# axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# train svm with best params from gridsearchcv\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.svm import SVC\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from joblib import dump, load\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearnex import patch_sklearn\n", + "patch_sklearn()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Retrain the model on the entire dataset\n", + "final_model_baseline = Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA(n_components=32)),\n", + " (\"svc\", 
SVC(C=2**10, gamma=2**-10, kernel='rbf'))\n",
+    "])\n",
+    "\n",
+    "# Fit the model on the entire dataset\n",
+    "final_model_baseline.fit(X1a, y)\n",
+    "\n",
+    "# Save the final model\n",
+    "from joblib import dump\n",
+    "dump(final_model_baseline, \"D:/thesis/models/Sensor A/baseline_pca32_c10_g-10.joblib\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X1b, y1b = create_ready_data('D:/thesis/data/converted/raw_B/sensor1')\n",
+    "X2b, y2b = create_ready_data('D:/thesis/data/converted/raw_B/sensor2')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# svm_model = load('D:/thesis/models/sensor2/SVM.joblib')\n",
+    "model1a = load(\"D:/thesis/models/Sensor A/baseline_pca32_c10_g-10.joblib\")\n",
+    "y_pred_model1a = model1a.predict(X1b)\n",
+    "model2a = load(\"D:/thesis/models/Sensor B/baseline_pca16_c5_g-5.joblib\")\n",
+    "y_pred_model2a = model2a.predict(X2b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Classification Report for Sensor B Baseline Model:\")\n",
+    "print(pd.DataFrame(classification_report(y2b, y_pred_model2a, output_dict=True)).transpose().to_latex(index=True, float_format=\"%.2f\"))\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "cm = confusion_matrix(y2b, y_pred_model2a, labels=[0,1,2,3,4,5,6])\n",
+    "plt.figure(figsize=(8, 6), dpi=300)\n",
+    "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", cbar=False)\n",
+    "plt.xlabel(\"Predicted Label\")\n",
+    "plt.ylabel(\"True Label\")\n",
+    "# plt.title(\"Confusion Matrix for Sensor B Baseline Model\")\n",
+    "# add cells outline\n",
+    "for i in range(cm.shape[0]):\n",
+    "    for j in range(cm.shape[1]):\n",
+    "        plt.gca().add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='black', 
lw=0.5))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Fine Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = [\n", + " { \n", + " \"svc\": [SVC()],\n", + " \"svc__kernel\": [\"rbf\"],\n", + " \"svc__C\": np.logspace(3, 7, 9, base=2),\n", + " \"svc__gamma\": np.logspace(-7, -3, 9, base=2)\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearnex\n", + "sklearnex.patch_sklearn()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "models_sensor = {\n", + " # \"Random Forest\": RandomForestClassifier(),\n", + " # \"Bagged Trees\": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10),\n", + " # \"Decision Tree\": DecisionTreeClassifier(),\n", + " # \"KNN\": KNeighborsClassifier(),\n", + " # \"LDA\": LinearDiscriminantAnalysis(),\n", + " # \"SVM\": make_pipeline(\n", + " # SVC(kernel='rbf')\n", + " # ),\n", + " # \"Grid SVM\": GridSearchCV(\n", + " # Pipeline([\n", + " # (\"scaler\", StandardScaler()),\n", + " # (\"svc\", SVC())\n", + " # ]),\n", + " # cv=cv,\n", + " # param_grid=param_grid,\n", + " # n_jobs=-1,\n", + " # scoring='accuracy',\n", + " # verbose=2\n", + " # ),\n", + "\n", + " \"FineGrid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA(n_components=16)),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv,\n", + " param_grid=param_grid,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", + "\n", + " # \"XGBoost\": XGBClassifier()\n", + " # \"MLPClassifier\": 
make_pipeline(\n", + " # StandardScaler(),\n", + " # MLPClassifier(hidden_layer_sizes=(1, 10), max_iter=500, random_state=42)\n", + " # )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_sensor2 = []\n", + "for name, model in models_sensor.items():\n", + " res = train_and_evaluate_model(model, name, \"Sensor B\", x_train2, y_train2, x_test2, y_test2, \n", + " export='D:/thesis/models/Sensor B')\n", + " results_sensor2.append(res)\n", + " print(f\"{name} on sensor2: Accuracy = {res['accuracy']:.2f}%\")\n", + "\n", + "# Display result\n", + "results_sensor2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import load\n", + "\n", + "model: GridSearchCV = load('D:/thesis/models/Sensor A/FineGrid+SVM+StandardScaler+PCA.joblib')\n", + "df = model.cv_results_\n", + "df = pd.DataFrame(df)\n", + "df['param_pca__n_components'] = 32\n", + "# df['param_pca__n_components'].unique()\n", + "# df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n", + "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n", + "# df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# turn param_svc__C and param_svc__gamma to log2 scale\n", + "\n", + "# df.iloc[np.argmax(df['mean_test_score'])]\n", + "df.nlargest(10, 'mean_test_score')\n", + "\n", + "# add best c and gamma for sensor B" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import matplotlib as mpl\n", + "\n", + "# Pivot table for contour plot with log2 transformation\n", + "pivot = df.pivot(\n", + " index='param_svc__C', \n", + " columns='param_svc__gamma', \n", + " values=\"mean_test_score\"\n", + ")\n", + "\n", + "# Create 
new log2-transformed indices and columns\n", + "log2_columns = np.log2(pivot.columns)\n", + "log2_indices = np.log2(pivot.index)\n", + "\n", + "\n", + "# Create a contour plot using log2-transformed data\n", + "plt.figure(figsize=(8, 6))\n", + "X, Y = np.meshgrid(log2_columns, log2_indices)\n", + "Z = pivot.values\n", + "\n", + "levels = np.linspace(0.6, Z.max(), 200) # Adjust the number of levels as needed\n", + "levels = [0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.978, 0.979, 0.9792]\n", + "# Create filled contours\n", + "# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n", + "# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n", + "\n", + "# Add contour lines\n", + "contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n", + "plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n", + "\n", + "# Set axis labels and title\n", + "plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n", + "plt.xlabel(\"log₂(gamma)\")\n", + "plt.ylabel(\"log₂(C)\")\n", + "\n", + "# Since we're already using log2 values, no need to transform tick labels\n", + "plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n", + "plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n", + "\n", + "print(plt.gca().get_yticklabels())\n", + "print(plt.gca().get_xticklabels())\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot contour plot of mean_test_score with x=param_svc__C and y=param_svc__gamma for each param_pca__n_components\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import seaborn as sns\n", + "norm = plt.Normalize(vmin=0.99, vmax=1)\n", + "for i in df['param_pca__n_components'].unique():\n", + " subset = df[df['param_pca__n_components'] == i]\n", + " pivot_table = subset.pivot(index='param_svc__gamma', 
columns='param_svc__C', values='mean_test_score')\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm, cbar=False)\n", + " # plt.title(f'Grid Search Mean Test Score (PCA components={i})')\n", + " plt.xlabel('C (log2 scale)')\n", + " plt.ylabel('Gamma (log2 scale)')\n", + " # plt.xscale('log', base=2)\n", + " # plt.yscale('log', base=2)\n", + " # set x and y ticks by converting the number into log base 2\n", + "# plt.xticks(ticks=np.arange(len(pivot_table.columns)), labels=[f\"{np.log2(c):.1f}\" for c in pivot_table.columns])\n", + "# plt.yticks(ticks=np.arange(len(pivot_table.index)), labels=[f\"{np.log2(g):.1f}\" for g in pivot_table.index])\n", + "\n", + " # make the mark ticks to be at the center of each cell range\n", + " plt.xticks(ticks=np.arange(len(pivot_table.columns)) + 0.5, labels=[f\"{np.log2(c):.1f}\" for c in pivot_table.columns], rotation=0)\n", + " plt.yticks(ticks=np.arange(len(pivot_table.index)) + 0.5, labels=[f\"{np.log2(g):.1f}\" for g in pivot_table.index])\n", + "\n", + " # add outline for the heatmap cells\n", + " for (i, j), val in np.ndenumerate(pivot_table.values):\n", + " plt.gca().add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='black', lw=0.5))\n", + "\n", + " # give mark by outline the heatmap cells with max mean_test_score\n", + " max_idx = subset['mean_test_score'].idxmax()\n", + " max_C = subset.loc[max_idx, 'param_svc__C']\n", + " max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n", + " max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n", + " max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n", + " plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=2))\n", + " plt.tick_params(axis='x', length=0) # Remove x-axis tick marks\n", + " plt.tick_params(axis='y', length=0) # Remove y-axis tick marks\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [
+    "#### Evaluation on Fine Grid Search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Retrain the model on the entire dataset\n",
+    "final_model_finegrid = Pipeline([\n",
+    "    (\"scaler\", StandardScaler()),\n",
+    "    (\"pca\", PCA(n_components=16)),\n",
+    "    (\"svc\", SVC(C=2**8, gamma=2**-8, kernel='rbf'))\n",
+    "])\n",
+    "\n",
+    "# Fit the model on the entire dataset\n",
+    "final_model_finegrid.fit(X1a, y)\n",
+    "\n",
+    "# Save the final model\n",
+    "from joblib import dump\n",
+    "dump(final_model_finegrid, \"D:/thesis/models/Sensor A/finegrid_pca16_c8_g-8.joblib\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# svm_model = load('D:/thesis/models/sensor2/SVM.joblib')\n",
+    "model1a = load(\"D:/thesis/models/Sensor A/finegrid_pca16_c8_g-8.joblib\")\n",
+    "y_pred_model1a = model1a.predict(X1b)\n",
+    "model2a = load(\"D:/thesis/models/Sensor B/finegrid_pca16_c3_g-5.5.joblib\")\n",
+    "y_pred_model2a = model2a.predict(X2b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(pd.DataFrame(classification_report(y1b, y_pred_model1a, output_dict=True)).transpose().to_latex(index=True, float_format=\"%.2f\"))\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "cm = confusion_matrix(y1b, y_pred_model1a, labels=[0,1,2,3,4,5,6])\n",
+    "plt.figure(figsize=(8, 6), dpi=300)\n",
+    "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", cbar=False)\n",
+    "plt.xlabel(\"Predicted Label\")\n",
+    "plt.ylabel(\"True Label\")\n",
+    "# plt.title(\"Confusion Matrix for Sensor A Fine Grid Model\")\n",
+    "# add cells outline\n",
+    "for i in range(cm.shape[0]):\n",
+    "    for j in range(cm.shape[1]):\n",
+    "        plt.gca().add_patch(plt.Rectangle((j, i), 1, 1, fill=False, 
edgecolor='black', lw=0.5))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### n_components accuracy vs fit time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Scale features\n", + "scaler = StandardScaler()\n", + "x_train1_scaled = scaler.fit_transform(x_train1)\n", + "x_test1_scaled = scaler.transform(x_test1)\n", + "\n", + "x_train2_scaled = scaler.fit_transform(x_train2)\n", + "x_test2_scaled = scaler.transform(x_test2)\n", + "\n", + "# Generate n_components list: 512 → 256 → 128 → ...\n", + "max_dim = 512 # cap at dataset dimension\n", + "n_components_list = []\n", + "n = max_dim\n", + "while n >= 1:\n", + " n_components_list.append(n)\n", + " n //= 2 # halve each time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "def perform_pca_accuracy_time(n_components_list, x_train, x_test, y_train, y_test):\n", + " times = []\n", + " accuracies = []\n", + " metadata = {}\n", + "\n", + " # Scale features\n", + " scaler = StandardScaler()\n", + " x_train_scaled = scaler.fit_transform(x_train)\n", + " x_test_scaled = scaler.transform(x_test)\n", + "\n", + " # Generate n_components list: 512 → 256 → 128 → ...\n", + " max_dim = 512 # cap at dataset dimension\n", + " n_components_list = []\n", + " n = max_dim\n", + " while n >= 1:\n", + " n_components_list.append(n)\n", + " n //= 2 # halve each time\n", + " \n", + " # Evaluate PCA + SVM for each n_components [1, 2, 4, ..., 512]\n", + " for n in reversed(n_components_list):\n", + " # PCA\n", + " pca = PCA(n_components=n)\n", + " X_train_pca = pca.fit_transform(x_train_scaled)\n", + " X_test_pca = pca.transform(x_test_scaled)\n", + "\n", + " # SVM\n", + " clf = SVC(kernel=\"rbf\", gamma=GAMMA_BEST, C=C_BEST)\n", + " start = time.time()\n", + " clf.fit(X_train_pca, 
y_train)\n", + " fit_time = time.time() - start\n", + " \n", + " y_pred = clf.predict(X_test_pca)\n", + " acc = accuracy_score(y_test, y_pred)\n", + " \n", + " times.append(fit_time)\n", + " accuracies.append(acc)\n", + "\n", + " metadata[n] = {\n", + " \"fit_time\": fit_time,\n", + " \"accuracy\": acc\n", + " }\n", + " return metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mpl_toolkits.axes_grid1 import host_subplot\n", + "import numpy as np\n", + "\n", + "def plot_perform_pca_accuracy_time(metadata):\n", + " host = host_subplot(111)\n", + " par = host.twinx()\n", + "\n", + " host.set_xlabel(\"n_components (log2 scale)\")\n", + " host.set_ylabel(\"Fit time (s)\")\n", + " par.set_ylabel(\"Accuracy\")\n", + "\n", + " x = [np.log2(i) for i in list(metadata.keys())]\n", + "\n", + " p1, = host.plot(x, [i[1]['fit_time'] for i in metadata.items()], label=\"Fit Time\")\n", + " p2, = par.plot(x, [i[1]['accuracy'] for i in metadata.items()], label=\"Accuracy\")\n", + "\n", + " # add grid lines\n", + "\n", + " host.grid(True)\n", + "\n", + " host.legend(labelcolor=\"linecolor\")\n", + "\n", + " # rename x axis label with f\"np.exp2(x)\"\n", + " # host.set_xticklabels([f\"{np.exp2(i):.0f}\" for i in x])\n", + "\n", + " # resize x axis to include up to 10 with interval 1\n", + " host.set_xticks(np.arange(min(x), max(x)+1, 1))\n", + " host.set_ylim(0, max([i[1]['fit_time'] for i in metadata.items()])*1.1)\n", + " par.set_ylim(0, 1.1)\n", + "\n", + " # show both y value text on each point\n", + " for i in range(len(x)):\n", + " host.text(x[i], metadata[list(metadata.keys())[i]]['fit_time'], f\"{metadata[list(metadata.keys())[i]]['fit_time']:.1f}\", color=p1.get_color())\n", + " par.text(x[i], metadata[list(metadata.keys())[i]]['accuracy'], f\"{metadata[list(metadata.keys())[i]]['accuracy']:.3f}\", color=p2.get_color())\n", + "\n", + " host.yaxis.label.set_color(p1.get_color())\n", + " 
par.yaxis.label.set_color(p2.get_color())\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def ratio_fit_time_accuracy(metadata):\n", + " ratios = {}\n", + " for n, values in metadata.items():\n", + " fit_time = values['fit_time']\n", + " accuracy = values['accuracy']\n", + " if accuracy != 0:\n", + " ratio = fit_time / accuracy\n", + " else:\n", + " ratio = float('inf') # Avoid division by zero\n", + " ratios[n] = ratio\n", + " return ratios" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Sensor A" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata_A = perform_pca_accuracy_time(n_components_list, x_train1, x_test1, y_train1, y_test1) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_perform_pca_accuracy_time(metadata_A)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ratios_A = ratio_fit_time_accuracy(metadata_A)\n", + "# scatter line\n", + "\n", + "plt.plot([np.log2(i) for i in ratios_A.keys()], ratios_A.values(), marker='o')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Sensor B" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata_B = perform_pca_accuracy_time(n_components_list, x_train2, x_test2, y_train2, y_test2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_perform_pca_accuracy_time(metadata_B) # add ratio plot between fit time and acc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# time in ms\n", + "import time\n", + "\n", + "start_time = time.time()\n", + 
"print(model.predict(x_test1.iloc[0:1, :]))\n", + "print(y_test1[0:1])\n", + "end_time = time.time()\n", + "print(f\"Prediction time: {(end_time - start_time) * 1000} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(pd.DataFrame(results_sensor1[0][\"classification_report\"]).transpose().to_latex(index=True, float_format=\"%.2f\", caption=\"Classification report on Dataset B\", label=\"tab:clf_report_auto\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "X = X1a\n", + "y = y\n", + "\n", + "# -----------------------\n", + "# PCA with 16 components\n", + "# (use min(n_features, 16) if fewer features)\n", + "# -----------------------\n", + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(X)\n", + "\n", + "n_components = min(16, X.shape[1]) # safe guard\n", + "pca = PCA(n_components=n_components)\n", + "X_pca = pca.fit_transform(X_scaled)\n", + "\n", + "# Wrap into DataFrame\n", + "pc_df = pd.DataFrame(\n", + " X_pca,\n", + " columns=[f\"PC{i+1}\" for i in range(n_components)]\n", + ")\n", + "pc_df[\"target\"] = y\n", + "\n", + "# -----------------------\n", + "# Save scatter plots for each PC pair\n", + "# -----------------------\n", + "output_folder = \"D:/thesis/figures/pca_scatter_plots\"\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "\n", + "for i in range(n_components):\n", + " for j in range(i+1, n_components):\n", + " plt.figure(figsize=(6, 5))\n", + " for label in set(y):\n", + " subset = pc_df[pc_df[\"target\"] == label]\n", + " plt.scatter(\n", + " subset[f\"PC{i+1}\"],\n", + " subset[f\"PC{j+1}\"],\n", + " label=f\"Class {label}\",\n", + " alpha=0.6\n", + " )\n", + " plt.xlabel(f\"PC{i+1} 
({pca.explained_variance_ratio_[i]:.2%})\")\n", + " plt.ylabel(f\"PC{j+1} ({pca.explained_variance_ratio_[j]:.2%})\")\n", + " plt.title(f\"PCA Scatter: PC{i+1} vs PC{j+1}\")\n", + " plt.legend()\n", + " plt.tight_layout()\n", + "\n", + " # Save to folder\n", + " filename = f\"PC{i+1}_vs_PC{j+1}.png\"\n", + " plt.savefig(os.path.join(output_folder, filename))\n", + " plt.close()\n", + "\n", + "print(f\"All pairwise plots saved to folder: {output_folder}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid_B = [\n", + " { \n", + " \"svc\": [SVC()],\n", + " \"svc__kernel\": [\"rbf\"],\n", + " \"svc__C\": np.exp2(np.arange(-5, 16, 5)),\n", + " \"svc__gamma\": np.exp2(np.arange(-15, 6, 5)),\n", + " \"pca__n_components\": [512, 256, 128, 64, 32, 16, 8, 4]\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv_B = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -752,12 +2472,18 @@ " # \"Decision Tree\": DecisionTreeClassifier(),\n", " # \"KNN\": KNeighborsClassifier(),\n", " # \"LDA\": LinearDiscriminantAnalysis(),\n", - " \"SVM\": SVC(),\n", - " # \"SVM with StandardScaler and PCA\": make_pipeline(\n", - " # StandardScaler(),\n", - " # PCA(n_components=10),\n", - " # SVC(kernel='rbf')\n", - " # ),\n", + " \"Grid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA()),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv_B,\n", + " param_grid=param_grid_B,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", " # \"XGBoost\": XGBClassifier()\n", "}" ] @@ -770,15 +2496,29 @@ "source": [ "results_sensor2 = []\n", "for name, model in models_sensor2.items():\n", - " res = train_and_evaluate_model(model, name, \"sensor2\", x_train2, y_train2, x_test2, 
y_test2, \n", - " export='D:/thesis/models/sensor2')\n", + " res = train_and_evaluate_model(model, name, \"Sensor B\", x_train2, y_train2, x_test2, y_test2, \n", + " export='D:/thesis/models/Sensor B')\n", " results_sensor2.append(res)\n", - " print(f\"{name} on sensor2: Accuracy = {res['accuracy']:.2f}%\")\n", + " print(f\"{name} on Sensor B: Accuracy = {res['accuracy']:.2f}%\")\n", "\n", "# Display result\n", "results_sensor2" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import load\n", + "\n", + "model_B: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n", + "df_B = model_B.cv_results_\n", + "df_B = pd.DataFrame(df_B)\n", + "df_B.iloc[np.argmax(df_B['mean_test_score'])]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -959,6 +2699,15 @@ "print(latex_table)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_rounded" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1142,11 +2891,49 @@ "source": [ "y_pred_svm" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrain whole dataset with best params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_sensor1 = {\n", + " \"Grid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA()),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv,\n", + " param_grid=param_grid,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", + "}" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "thesis", "language": "python", "name": "python3" },