From 9e3e234ef65d84ccfc9a6b0719fe9236b26be0a2 Mon Sep 17 00:00:00 2001
From: nuluh
Date: Wed, 15 Oct 2025 13:14:17 +0700
Subject: [PATCH] feat(notebooks): major changes

---
 code/notebooks/stft.ipynb | 1895 +++++++++++++++++++++++++++++++++++--
 1 file changed, 1841 insertions(+), 54 deletions(-)

diff --git a/code/notebooks/stft.ipynb b/code/notebooks/stft.ipynb
index 3e661b7..3e29c43 100644
--- a/code/notebooks/stft.ipynb
+++ b/code/notebooks/stft.ipynb
@@ -370,29 +370,30 @@
 "source": [
 "#### Undamaged case (d0)\n",
 "![Overview of the signal preprocessing for undamaged case](attachment:image.png)\n",
+ "\n",
 "The figure above shows the overview of the signal preprocessing pipeline for undamaged case $(d_0)$. Notice that undamaged case $(d_0)$ is formed from complementary of each `zzzAD{n}` in damage case $(d_1—d_6)$ as explained in [Generate undamaged case index from complementary pairs for each damaged file of Dataset A](#Generate-undamaged-case-index-from-complementary-pairs-for-each-damaged-file-of-Dataset-A).\n",
 "\n",
- "To balance the data for undamaged case $(d_0)$, each produced STFT from $d_0$ ($513 \times 513$) will only be took its first $22$ timeframes for the first 45 $(\verb|{test_num}|\le 45)$ input data and its first $21$ timeframes for the rest of the input data $(\verb|{test_num}|>45)$. This number was determined by solving linear equation \n",
+ "To balance the data for undamaged case $(d_0)$, each produced STFT from $d_0$ ($513 \times 513$) will only take its first $21$ timeframes for the first 65 $(\verb|{test_num}|\le 65)$ input data and its first $20$ timeframes for the rest of the input data $(\verb|{test_num}|>65)$. These numbers were determined by solving the linear system\n",
 "\n",
 "$$\n",
 "\begin{align*}\n",
- "\frac{2565}{120} = 21.375\n",
+ "\frac{2565}{125} = 20.52\n",
 "\begin{cases}\n",
- "21 & \text{or} \\\n",
- "22\n",
+ "20 & \text{or} \\\n",
+ "21\n",
 "\end{cases}\n",
 "\end{align*}\n",
 "$$\n",
 "\n",
 "$$\n",
 "\begin{align*}\n",
- "21x + 22y &= 2565 \\\n",
- "x + y &= 120 \\\n",
+ "20x + 21y &= 2565 \\\n",
+ "x + y &= 125 \\\n",
 "\end{align*}\n",
 "$$\n",
 "\n",
 "$$\n",
- "y = 45, x = 75\n",
+ "x = 60, y = 65\n",
 "$$\n",
 "\n",
 "to achieve same samples number $(2565)$ for all labels $(d_0—d_6)$."
@@ -450,7 +451,7 @@ " plt.yticks(np.linspace(0, len(data.columns)-1, y_num_ticks)) # Set y-ticks at regular intervals\n", " plt.rcParams['svg.fonttype'] = 'none'\n", " plt.rcParams['text.usetex'] = False # use mathtext, not external LaTeX\n", - " plt.savefig(\"output_single.svg\", format=\"svg\", dpi=100)\n", + " plt.savefig(\"output_single.svg\", facecolor='none', format=\"svg\", dpi=100)\n", " plt.show()\n", "\n", " elif type(data) == list and len(data) > 1:\n", @@ -483,7 +484,7 @@ " plt.rcParams['svg.fonttype'] = 'none'\n", " plt.rcParams['text.usetex'] = False # use mathtext, not external LaTeX\n", " mpl.rcParams['text.usetex'] = False # use mathtext, not external LaTeX\n", - " plt.savefig(\"output_multiple.svg\", format=\"svg\", dpi=80)\n", + " plt.savefig(\"output_multiple.svg\", facecolor='none', format=\"svg\", dpi=80)\n", " plt.show()" ] }, @@ -607,11 +608,11 @@ "X1a, y = create_ready_data('D:/thesis/data/converted/raw/sensor1')\n", "\n", "# Display DataFrame\n", - "pd.concat([\n", - " pd.DataFrame(y, columns=['label']), # Labels\n", - " X1a, # Features\n", - " ], \n", - " axis=1)" + "# pd.concat([\n", + "# pd.DataFrame(y, columns=['label']), # Labels\n", + "# X1a, # Features\n", + "# ], \n", + "# axis=1)" ] }, { @@ -623,11 +624,11 @@ "X2a, y = create_ready_data('D:/thesis/data/converted/raw/sensor2')\n", "\n", "# Display DataFrame\n", - "pd.concat([\n", - " pd.DataFrame(y, columns=['label']), # Labels\n", - " X2a, # Features\n", - " ], \n", - " axis=1)" + "# pd.concat([\n", + "# pd.DataFrame(y, columns=['label']), # Labels\n", + "# X2a, # Features\n", + "# ], \n", + "# axis=1)" ] }, { @@ -664,25 +665,143 @@ "print(\"Shape of y2_train:\", y_train1.shape)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Scree plot" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from src.ml.model_selection import train_and_evaluate_model\n", - "from sklearn.svm import SVC\n", - "from 
sklearn.pipeline import make_pipeline\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "from sklearn.svm import SVC\n",
+ "import matplotlib.pyplot as plt\n",
 "from sklearn.decomposition import PCA\n",
- "# from xgboost import XGBClassifier\n",
- "# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier\n",
- "# from sklearn.tree import DecisionTreeClassifier\n",
- "# from sklearn.neighbors import KNeighborsClassifier\n",
- "# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
- "from sklearn.neural_network import MLPClassifier\n",
- "\n"
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "# Assuming X is your dataset\n",
+ "scaler1 = StandardScaler()\n",
+ "scaler2 = StandardScaler()\n",
+ "X_scaled1 = scaler1.fit_transform(x_train1)\n",
+ "X_scaled2 = scaler2.fit_transform(x_train2)\n",
+ "\n",
+ "# Perform PCA\n",
+ "pca1 = PCA()\n",
+ "pca1.fit(X_scaled1)\n",
+ "pca2 = PCA()\n",
+ "pca2.fit(X_scaled2)\n",
+ "\n",
+ "# # Explained variance ratio\n",
+ "# explained_variance_ratio = pca1.explained_variance_ratio_\n",
+ "\n",
+ "# # Create the scree plot\n",
+ "# plt.figure(figsize=(8, 6))\n",
+ "# plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')\n",
+ "# plt.title('Scree Plot')\n",
+ "# plt.xlabel('Principal Component')\n",
+ "# plt.ylabel('Explained Variance Ratio')\n",
+ "# plt.grid()\n",
+ "# plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "# Original data - Cumulative explained variance for first dataset\n",
+ "cumulative_variance = np.cumsum(pca1.explained_variance_ratio_)\n",
+ "n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1\n",
+ "n_components_16 = 16\n",
+ "cumulative_variance_16 = cumulative_variance[n_components_16 - 1]\n",
+ "\n",
+ "# Create figure and primary axis\n",
+ "fig, ax1 = plt.subplots(figsize=(8, 6))\n", + "\n", + "# Plot first dataset on primary axis\n", + "ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', \n", + " label='Dataset 1', color='blue')\n", + "ax1.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')\n", + "ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'n={n_components_95}')\n", + "ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_16:.2f})')\n", + "ax1.axhline(y=cumulative_variance_16, color='b', linestyle='--')\n", + "\n", + "# Set labels and properties for first dataset\n", + "ax1.set_xlabel('Principal Component')\n", + "ax1.set_ylabel('Dataset 1 Variance Ratio', color='blue')\n", + "ax1.tick_params(axis='y', labelcolor='blue')\n", + "ax1.grid(True, alpha=0.3)\n", + "\n", + "# Create secondary y-axis that shares the same x-axis\n", + "ax2 = ax1.twinx()\n", + "\n", + "# Example second dataset (replace with your actual data)\n", + "# For demonstration, I'm creating synthetic data with a different scale\n", + "second_dataset = np.sqrt(cumulative_variance) # Just an example - replace with your actual data\n", + "\n", + "# Plot second dataset on secondary axis\n", + "ax2.plot(range(1, len(second_dataset) + 1), second_dataset, marker='s', linestyle='-', \n", + " color='red', label='Dataset 2')\n", + "\n", + "# Set properties for second dataset\n", + "ax2.set_ylabel('Dataset 2 Variance Ratio', color='red')\n", + "ax2.tick_params(axis='y', labelcolor='red')\n", + "\n", + "# Create combined legend\n", + "lines1, labels1 = ax1.get_legend_handles_labels()\n", + "lines2, labels2 = ax2.get_legend_handles_labels()\n", + "ax1.legend(lines1 + lines2, labels1 + labels2, loc='best')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the stability of PCA" + ] + }, + { + "cell_type": "code", + 
"execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.decomposition import PCA\n",
+ "from sklearn.utils import resample\n",
+ "import numpy as np\n",
+ "\n",
+ "angles = []\n",
+ "\n",
+ "for i in range(100): # bootstrap 100 times\n",
+ " Xb = resample(X1a)\n",
+ " pca = PCA().fit(Xb)\n",
+ " v1 = pca.components_[0] # first eigenvector\n",
+ " if i > 0:\n",
+ " angle = np.abs(np.dot(v1, prev_v1)) # cosine similarity\n",
+ " angles.append(angle)\n",
+ " prev_v1 = v1\n",
+ "\n",
+ "np.mean(angles) # close to 1 = stable; lower = unstable"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sklearnex\n",
+ "sklearnex.patch_sklearn()\n"
+ ]
+ },
 {
 "cell_type": "code",
@@ -698,28 +817,448 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "# Define models for sensor1\n",
- "models_sensor1 = {\n",
+ "from src.ml.model_selection import train_and_evaluate_model\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.decomposition import PCA\n",
+ "# from xgboost import XGBClassifier\n",
+ "# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier\n",
+ "# from sklearn.tree import DecisionTreeClassifier\n",
+ "# from sklearn.neighbors import KNeighborsClassifier\n",
+ "# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
+ "# from sklearn.neural_network import MLPClassifier\n",
+ "\n",
+ "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n",
+ "\n"
plot_tsne(pca, y, n, save_to):\n", + " tsne = TSNE(n_components=2, perplexity=70, learning_rate=200, random_state=10)\n", + " X_tsne = tsne.fit_transform(pca)\n", + " # set size\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " # add color bar\n", + " # remove plot area lines\n", + " plt.gca().spines['top'].set_visible(False)\n", + " plt.gca().spines['right'].set_visible(False)\n", + " plt.gca().spines['bottom'].set_visible(False)\n", + " plt.gca().spines['left'].set_visible(False)\n", + "\n", + " # remove ticks\n", + " plt.xticks([])\n", + " plt.yticks([])\n", + "\n", + " # Example: X_tsne is (n_samples, 2) array; y_train1 is class labels (0, 1, 2, ...)\n", + " # X_tsne, y_train1 = ...\n", + "\n", + " # Choose a color map (tab10 is good for up to 10 classes)\n", + " cmap = plt.get_cmap('tab10')\n", + "\n", + " # Get the unique class labels\n", + " classes = np.unique(y)\n", + "\n", + " # Create the scatter plot, one class at a time\n", + " for i, cls in enumerate(classes):\n", + " plt.scatter(\n", + " X_tsne[y == cls, 0],\n", + " X_tsne[y == cls, 1],\n", + " color=cmap(i),\n", + " label=str(cls),\n", + " alpha=0.6\n", + " )\n", + "\n", + " # Add title and legend\n", + " # plt.title(\"t-SNE visualization with class labels\")\n", + " plt.legend(title=\"Classes\")\n", + " if save_to:\n", + " if not os.path.exists(save_to):\n", + " os.makedirs(save_to)\n", + " plt.savefig(f\"{save_to}/tsne_pca{n}.png\", dpi=300)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pacmap\n", + "import os\n", + "\n", + "def plot_pacmap(pca, y, n, save_to):\n", + " embedding = pacmap.PaCMAP(n_components=2, n_neighbors=20, MN_ratio=1, FP_ratio=2.0, random_state=10)\n", + " X_pacmap = embedding.fit_transform(pca)\n", + "\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " # Choose a color map (tab10 is good for up to 10 classes)\n", + " cmap = 
plt.get_cmap('tab10')\n", + "\n", + " # Get the unique class labels\n", + " classes = np.unique(y)\n", + "\n", + " # Create the scatter plot, one class at a time\n", + " for i, cls in enumerate(classes):\n", + " plt.scatter(\n", + " X_pacmap[y == cls, 0],\n", + " X_pacmap[y == cls, 1],\n", + " color=cmap(i),\n", + " label=str(cls),\n", + " alpha=0.6\n", + " )\n", + " # legend\n", + " plt.gca().spines['top'].set_visible(False)\n", + " plt.gca().spines['right'].set_visible(False)\n", + " plt.gca().spines['bottom'].set_visible(False)\n", + " plt.gca().spines['left'].set_visible(False)\n", + " # remove ticks\n", + " plt.xticks([])\n", + " plt.yticks([])\n", + " plt.tight_layout()\n", + " plt.legend()\n", + " if save_to:\n", + " if not os.path.exists(save_to):\n", + " os.makedirs(save_to)\n", + " plt.savefig(f\"{save_to}/pacmap_pca{n}.png\", dpi=300)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(x_train2) \n", + "for n in [512]:\n", + " # pca = PCA(n_components=n).fit_transform(X_scaled)\n", + " pca = X_scaled\n", + " plot_tsne(pca, y_train2, n, save_to=\"D:/thesis/figures/Sensor B\")\n", + " plot_pacmap(pca, y_train2, n, save_to=\"D:/thesis/figures/Sensor B\")\n", + "# plt.plot(np.cumsum(pca.explained_variance_ratio_))\n", + "# plt.xlabel('number of components')\n", + "# plt.ylabel('cumulative explained variance')\n", + "# # set y ticks with step 0.05\n", + "# plt.yticks(np.arange(0, 1.05, 0.05))\n", + "# plt.grid()\n", + "# # show number of components where cumulative explained variance = 0.95\n", + "# n_95 = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1\n", + "# plt.axvline(n_95, color='r', linestyle='--')\n", + "# plt.text(n_95+2, 0.5, f'n={n_95}', color='r')\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca = PCA(n_components=0.95).fit(X_scaled)\n", + "pca.components_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loadings = pd.DataFrame(pca.components_.T,\n", + " columns=[f\"PC{i+1}\" for i in range(pca.n_components_)],\n", + " index=X1a.columns)\n", + "loadings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca.explained_variance_ratio_[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "top = loadings.iloc[:, 0].abs().nlargest(5)\n", + "loadings.loc[top.index, :].abs()*loadings.loc[top.index, :].apply(np.sign)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show only label 0 of the tsne plot\n", + "scatter = plt.scatter(X_tsne[y_train1 == 0, 0], X_tsne[y_train1 == 0, 1], c='red', label='Label 0')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show only the one with y == 0\n", + "# use tab10[0] color\n", + "# make it blend with add filter\n", + "\n", + "from sklearn.cluster import DBSCAN\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patheffects as patheffects\n", + "import matplotlib.patches as mpatches\n", + "\n", + "def cluster_per_label(embedding, labels, targets, *, eps=0.8, min_samples=30):\n", + " summary = {}\n", + " plt.figure(figsize=(8, 5))\n", + " cmap = plt.get_cmap(\"tab10\")\n", + " legend_handles = []\n", + " for target, _ in targets:\n", + " subset = embedding[labels == target]\n", + " if subset.size == 0:\n", + " summary[target] = {}\n", + " continue\n", + "\n", + " model = DBSCAN(eps=eps, min_samples=min_samples).fit(subset)\n", + " cluster_ids = model.labels_\n", + 
"\n", + " valid_clusters = [cid for cid in np.unique(cluster_ids) if cid != -1]\n", + " counts = {cid: int(np.sum(cluster_ids == cid)) for cid in valid_clusters}\n", + " summary[target] = counts\n", + "\n", + " label_color = cmap(target % cmap.N)\n", + "\n", + " colors = np.array([\n", + " label_color if cid != -1 else (0.8, 0.8, 0.8, 0.3)\n", + " for cid in cluster_ids\n", + " ])\n", + "\n", + " scatter = plt.scatter(\n", + " subset[:, 0],\n", + " subset[:, 1],\n", + " c=colors,\n", + " alpha=0.01\n", + " )\n", + " legend_handles.append(\n", + " mpatches.Patch(color=label_color, label=f\"Damage {target} (clusters={len(valid_clusters)})\")\n", + " )\n", + " for cid in valid_clusters:\n", + " cluster_pts = subset[cluster_ids == cid]\n", + " center = cluster_pts.mean(axis=0)\n", + " txt = plt.text(\n", + " center[0],\n", + " center[1],\n", + " f\"{counts[cid]}\",\n", + " color=label_color,\n", + " fontsize=9,\n", + " fontweight=\"bold\",\n", + " ha=\"left\",\n", + " va=\"bottom\",\n", + " )\n", + " txt.set_path_effects([\n", + " patheffects.withStroke(linewidth=1.8, foreground=\"white\")\n", + " ])\n", + " plt.xlabel(\"PaCMAP-1\")\n", + " plt.ylabel(\"PaCMAP-2\")\n", + " plt.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " plt.tight_layout()\n", + " plt.figure(dpi=600)\n", + " # move legends outside the plot\n", + " plt.show()\n", + " return summary\n", + "\n", + "cluster_counts = cluster_per_label(\n", + " X_pacmap,\n", + " y_train1,\n", + " targets=[(0,0),(1,1), (2,2), (3,3), (4,4), (5,5), (6,6)],\n", + " eps=0.8,\n", + " min_samples=30,\n", + ")\n", + "\n", + "for label_id, counts in cluster_counts.items():\n", + " print(f\"Damage {label_id}\")\n", + " if counts:\n", + " for cluster_id, n_points in counts.items():\n", + " print(f\" Cluster {cluster_id}: {n_points} points\")\n", + " else:\n", + " print(\" No clusters detected (all noise)\")\n", + "\n", + "print(cluster_counts)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(X1a) \n", + "\n", + "pca = PCA(n_components=50).fit_transform(X_scaled)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0, random_state=10)\n", + "X_pacmap = embedding.fit_transform(pca)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.cluster import KMeans\n", + "import numpy as np\n", + "from collections import Counter\n", + "\n", + "# Suppose X_embedded is your 2D PaCMAP / t-SNE / UMAP embedding\n", + "kmeans = KMeans(n_clusters=40, random_state=42) # choose k\n", + "clusters = kmeans.fit_predict(X_pacmap)\n", + "\n", + "counts = Counter(clusters)\n", + "print(counts)\n", + "\n", + "plt.figure(figsize=(10,8))\n", + "sc = plt.scatter(X_pacmap[:, 0], X_pacmap[:, 1], c=clusters, cmap=\"tab20\", s=10)\n", + "cbar = plt.colorbar(sc, ticks=range(kmeans.n_clusters))\n", + "# Label clusters with counts at their centroids\n", + "centers = kmeans.cluster_centers_\n", + "for i, (x, y) in enumerate(centers):\n", + " plt.text(x, y, str(counts[i]), fontsize=12, weight='bold',\n", + " ha='left', va='bottom', color='black',\n", + " bbox=dict(facecolor='white', alpha=0, edgecolor='none'))\n", + "\n", + "plt.title(\"Clusters with counts\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add Parameters Grid" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Coarse Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = [\n", + " { \n", + " \"svc\": [SVC()],\n", + " \"svc__kernel\": [\"rbf\"],\n", + " \"svc__C\": 
np.exp2(np.arange(-5, 16, 5)),\n", + " \"svc__gamma\": np.exp2(np.arange(-15, 6, 5)),\n", + " \"pca__n_components\": [512, 256, 128, 64, 32, 16, 8, 4]\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define models for sensor\n", + "models_sensor = {\n", " # \"Random Forest\": RandomForestClassifier(),\n", " # \"Bagged Trees\": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10),\n", " # \"Decision Tree\": DecisionTreeClassifier(),\n", " # \"KNN\": KNeighborsClassifier(),\n", " # \"LDA\": LinearDiscriminantAnalysis(),\n", " # \"SVM\": make_pipeline(\n", - " # StandardScaler(),\n", - " # SVC(kernel='rbf', probability=True)\n", + " # SVC(kernel='rbf')\n", " # ),\n", - " \"SVM with StandardScaler and PCA\": make_pipeline(\n", - " StandardScaler(),\n", - " PCA(n_components=10),\n", - " SVC(kernel='rbf', probability=True)\n", - " ),\n", + " # \"Grid SVM\": GridSearchCV(\n", + " # Pipeline([\n", + " # (\"scaler\", StandardScaler()),\n", + " # (\"svc\", SVC())\n", + " # ]),\n", + " # cv=cv,\n", + " # param_grid=param_grid,\n", + " # n_jobs=-1,\n", + " # scoring='accuracy',\n", + " # verbose=2\n", + " # ),\n", + "\n", + " \"Grid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA()),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv,\n", + " param_grid=param_grid,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", "\n", " # \"XGBoost\": XGBClassifier()\n", - " \"MLPClassifier\": make_pipeline(\n", - " StandardScaler(),\n", - " MLPClassifier(hidden_layer_sizes=(1, 10), max_iter=500, random_state=42)\n", - " )\n", + " # \"MLPClassifier\": make_pipeline(\n", + " # StandardScaler(),\n", + " # 
MLPClassifier(hidden_layer_sizes=(1, 10), max_iter=500, random_state=42)\n", + " # )\n", "}" ] }, @@ -730,7 +1269,7 @@ "outputs": [], "source": [ "results_sensor1 = []\n", - "for name, model in models_sensor1.items():\n", + "for name, model in models_sensor.items():\n", " res = train_and_evaluate_model(model, name, \"Sensor A\", x_train1, y_train1, x_test1, y_test1, \n", " export='D:/thesis/models/Sensor A')\n", " results_sensor1.append(res)\n", @@ -740,6 +1279,1187 @@ "results_sensor1" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_sensor2 = []\n", + "for name, model in models_sensor.items():\n", + " res = train_and_evaluate_model(model, name, \"Sensor B\", x_train2, y_train2, x_test2, y_test2, \n", + " export='D:/thesis/models/Sensor B')\n", + " results_sensor2.append(res)\n", + " print(f\"{name} on sensor2: Accuracy = {res['accuracy']:.2f}%\")\n", + "\n", + "# Display result\n", + "results_sensor2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import load\n", + "from sklearn.model_selection import GridSearchCV\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "model: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n", + "df = model.cv_results_\n", + "df = pd.DataFrame(df)\n", + "df\n", + "# change column \"param_svc__C\" to np.log2\n", + "# df['param_svc__C'] = np.log2(df['param_svc__C'])\n", + "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma'])\n", + "\n", + "# for i in df['param_pca__n_components'].unique():\n", + "# # show the other columns where mean_test_score is max\n", + "# idx = df[df['param_pca__n_components'] == i]['mean_test_score'].idxmax()\n", + "\t\n", + "# print(i, \":\", df.iloc[idx][\"param_svc__C\"], df.iloc[idx][\"param_svc__gamma\"], df.iloc[idx][\"mean_test_score\"], df.iloc[idx][\"mean_fit_time\"], 
df.iloc[idx][\"mean_score_time\"])\n", + "\n", + "# Get rows where param_pca__n_components is 32\n", + "# result = df[df['param_pca__n_components'] == 32]\n", + "# top 10 most fit time\n", + "result.nlargest(10, 'mean_test_score')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot contour plot of mean_test_score with x=param_svc__C and y=param_svc__gamma for each param_pca__n_components\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import seaborn as sns\n", + "norm = plt.Normalize(vmin=0, vmax=1)\n", + "for i in df['param_pca__n_components'].unique():\n", + " subset = df[df['param_pca__n_components'] == i]\n", + " pivot_table = subset.pivot(index='param_svc__gamma', columns='param_svc__C', values='mean_test_score')\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm)\n", + " plt.xlabel('C (log2 scale)')\n", + " plt.ylabel('Gamma (log2 scale)')\n", + " plt.xticks(ticks=np.arange(len(pivot_table.columns)) + 0.5, labels=[f\"{np.log2(c):.0f}\" for c in pivot_table.columns])\n", + " plt.yticks(ticks=np.arange(len(pivot_table.index)) + 0.5, labels=[f\"{np.log2(g):.0f}\" for g in pivot_table.index])\n", + "\n", + " # Add outline for the heatmap cells\n", + " for (j, k), val in np.ndenumerate(pivot_table.values):\n", + " plt.gca().add_patch(plt.Rectangle((j, k), 1, 1, fill=False, edgecolor='black', lw=0.5))\n", + "\n", + " # Highlight the cell with the max mean_test_score\n", + " max_idx = subset['mean_test_score'].idxmax()\n", + " max_C = subset.loc[max_idx, 'param_svc__C']\n", + " max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n", + " max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n", + " max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n", + " 
plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=4))\n", + "\n", + " # Save the figure for the current PCA component\n", + " print(i)\n", + " plt.savefig(f'D:/thesis/figures/Sensor B/grid_pca{i}.png', dpi=300)\n", + " print(f\"Saved figure: D:/thesis/figures/Sensor B/grid_pca{i}.png\")\n", + " plt.close() # Close the figure to avoid overlapping figures in the next iteration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Efficiency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# make param c and gamma to be log2 scale\n", + "df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n", + "df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n", + "# daata frame to include only highest mean_test_score for each param_pca__n_components\n", + "best_results = df.loc[df.groupby('param_pca__n_components')['mean_test_score'].idxmax()][['param_pca__n_components', 'param_svc__C', 'param_svc__gamma', 'mean_test_score', 'mean_fit_time']]\n", + "# add ratio of mean test score to mean fit time return in 10^-3\n", + "best_results['time_score_ratio'] = best_results['mean_test_score'] / best_results['mean_fit_time'] * 1e3\n", + "print(best_results.to_latex(float_format=\"%.5f\", index=False))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.lineplot(x=\"param_svc__C\", y=\"mean_test_score\", hue=\"param_svc__kernel\", data=df, marker=\"o\")\n", + "plt.xscale(\"log\", base=2)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"max_diff\"] = df[[\"split0_test_score\",\"split1_test_score\",\n", + " \"split2_test_score\",\"split3_test_score\",\"split4_test_score\"]].max(axis=1) - \\\n", + 
" df[[\"split0_test_score\",\"split1_test_score\",\n", + " \"split2_test_score\",\"split3_test_score\",\"split4_test_score\"]].min(axis=1)\n", + "\n", + "sns.scatterplot(x=\"mean_test_score\", y=\"max_diff\", hue=\"param_svc__kernel\", data=df)\n", + "plt.title(\"Stability vs Score\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# ---------------------------\n", + "# Prep\n", + "# ---------------------------\n", + "def _prep(cv_df: pd.DataFrame):\n", + " df = cv_df.copy()\n", + " df = df.rename(columns={\n", + " \"param_pca__n_components\": \"n\",\n", + " \"param_svc__kernel\": \"kernel\",\n", + " \"param_svc__C\": \"C\",\n", + " \"param_svc__gamma\": \"gamma\",\n", + " \"mean_test_score\": \"mean\",\n", + " \"std_test_score\": \"std\",\n", + " \"mean_fit_time\": \"fit_time\",\n", + " \"mean_score_time\": \"score_time\"\n", + " })\n", + " # Numeric coercion (robust to object dtypes)\n", + " for col in [\"C\", \"gamma\", \"mean\", \"std\", \"fit_time\", \"score_time\", \"n\"]:\n", + " if col in df.columns:\n", + " df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n", + " # Count CV splits from split* columns\n", + " split_cols = [c for c in df.columns if c.startswith(\"split\") and c.endswith(\"_test_score\")]\n", + " n_splits = max(1, len(split_cols))\n", + " # Derived metrics\n", + " df[\"se\"] = df[\"std\"] / np.sqrt(n_splits)\n", + " df[\"total_time\"] = df[\"fit_time\"] + df[\"score_time\"]\n", + " df[\"lcb\"] = df[\"mean\"] - 1.96 * df[\"se\"] # 95% lower confidence bound\n", + " return df, n_splits\n", + "\n", + "# ---------------------------\n", + "# Pareto mask (maximize 'score_col', minimize 'time_col')\n", + "# ---------------------------\n", + "def _pareto_mask(d: pd.DataFrame, score_col=\"lcb\", time_col=\"total_time\") -> pd.Series:\n", + " # A point is NON-dominated if no other point is <= time and 
>= score\n", + " pts = d[[time_col, score_col]].to_numpy()\n", + " n = len(d)\n", + " is_nd = np.ones(n, dtype=bool)\n", + " for i in range(n):\n", + " if not is_nd[i]:\n", + " continue\n", + " t_i, s_i = pts[i]\n", + " dominated_by_any = (\n", + " ((pts[:,0] <= t_i) & (pts[:,1] >= s_i)) &\n", + " ((pts[:,0] < t_i) | (pts[:,1] > s_i))\n", + " )\n", + " dominated_by_any[i] = False # ignore self\n", + " if np.any(dominated_by_any):\n", + " is_nd[i] = False\n", + " return pd.Series(is_nd, index=d.index)\n", + "\n", + "# ---------------------------\n", + "# Boundary (edge-of-grid) check\n", + "# ---------------------------\n", + "def _edge_flags(row, uniques):\n", + " def edge(val, arr):\n", + " arr = np.asarray(sorted(x for x in arr if pd.notnull(x)))\n", + " if len(arr) == 0 or pd.isnull(val):\n", + " return False\n", + " return np.isclose(val, arr.min()) or np.isclose(val, arr.max())\n", + " return {\n", + " \"edge_C\": edge(row[\"C\"], uniques.get(\"C\", [])),\n", + " \"edge_gamma\": edge(row[\"gamma\"], uniques.get(\"gamma\", [])),\n", + " \"edge_n\": edge(row[\"n\"], uniques.get(\"n\", [])),\n", + " }\n", + "\n", + "# ---------------------------\n", + "# Main analyzer\n", + "# ---------------------------\n", + "def analyze_efficiency(\n", + " cv_df: pd.DataFrame,\n", + " w_fit: float = 1.0,\n", + " w_score: float = 1.0,\n", + " use_lcb: bool = True\n", + "):\n", + " \"\"\"\n", + " Returns:\n", + " summary_by_n: DataFrame (one picked row per n with diagnostics)\n", + " pareto_by_n: dict[n] -> DataFrame of Pareto frontier candidates\n", + " picks_by_n: dict[n] -> Series (the picked row)\n", + " global_reco: Series (best overall by efficiency)\n", + " \"\"\"\n", + " df, n_splits = _prep(cv_df)\n", + "\n", + " # Allow custom weighting of fit vs score time (e.g., inference matters more)\n", + " df[\"total_time\"] = w_fit * df[\"fit_time\"] + w_score * df[\"score_time\"]\n", + " score_col = \"lcb\" if use_lcb else \"mean\"\n", + "\n", + " # Cache unique grids for 
edge-of-grid flags\n", + " uniques = {\n", + " \"C\": df[\"C\"].unique(),\n", + " \"gamma\": df[\"gamma\"].unique(),\n", + " \"n\": df[\"n\"].unique()\n", + " }\n", + "\n", + " pareto_by_n, picks_by_n, rows = {}, {}, []\n", + "\n", + " for n, g in df.groupby(\"n\"):\n", + " g = g.dropna(subset=[\"total_time\", score_col]).copy()\n", + " if g.empty:\n", + " continue\n", + "\n", + " # Pareto frontier within this n\n", + " mask = _pareto_mask(g, score_col=score_col, time_col=\"total_time\")\n", + " pareto = g.loc[mask].copy()\n", + " pareto[\"efficiency\"] = pareto[score_col] / pareto[\"total_time\"]\n", + "\n", + " # Choose the efficiency champion at this n\n", + " pick = pareto.loc[pareto[\"efficiency\"].idxmax()].copy()\n", + "\n", + " # Compare versus best-accuracy-at-n (not risk-adjusted)\n", + " best_acc_row = g.loc[g[\"mean\"].idxmax()].copy()\n", + " acc_loss_vs_best = best_acc_row[\"mean\"] - pick[\"mean\"] # >= 0 means pick is slightly worse in accuracy\n", + " speedup_vs_best = best_acc_row[\"total_time\"] / pick[\"total_time\"] # >1 means pick is faster\n", + "\n", + " # Edge-of-grid diagnostics\n", + " flags = _edge_flags(pick, uniques)\n", + "\n", + " # Pack summary row\n", + " rows.append({\n", + " \"n\": n,\n", + " \"kernel\": pick.get(\"kernel\"),\n", + " \"C\": pick.get(\"C\"),\n", + " \"gamma\": pick.get(\"gamma\"),\n", + " \"mean_score\": pick[\"mean\"],\n", + " \"std_score\": pick[\"std\"],\n", + " \"se_score\": pick[\"se\"],\n", + " \"LCB_score\": pick[\"lcb\"],\n", + " \"fit_time_s\": pick[\"fit_time\"],\n", + " \"score_time_s\": pick[\"score_time\"],\n", + " \"total_time_s\": pick[\"total_time\"],\n", + " \"efficiency\": pick[\"efficiency\"], # (LCB or mean)/sec\n", + " \"acc_loss_vs_best_at_n\": acc_loss_vs_best,\n", + " \"speedup_vs_best_at_n\": speedup_vs_best,\n", + " \"pareto_size_at_n\": len(pareto),\n", + " \"edge_C\": flags[\"edge_C\"],\n", + " \"edge_gamma\": flags[\"edge_gamma\"],\n", + " \"edge_n\": flags[\"edge_n\"],\n", + " })\n", 
+ "\n", + " pareto_by_n[n] = (pareto\n", + " .sort_values([\"total_time\", score_col], ascending=[True, False])\n", + " .reset_index(drop=True))\n", + " picks_by_n[n] = pick\n", + "\n", + " summary_by_n = pd.DataFrame(rows).sort_values(\"efficiency\", ascending=False).reset_index(drop=True)\n", + "\n", + " # Best overall by efficiency\n", + " global_reco = None\n", + " if not summary_by_n.empty:\n", + " global_reco = summary_by_n.loc[summary_by_n[\"efficiency\"].idxmax()]\n", + "\n", + " return summary_by_n, pareto_by_n, picks_by_n, global_reco\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# summary_by_n: one recommended (most efficient) config per PCA n_components\n", + "# pareto_by_n[n]: the Pareto frontier at that n (all strong time/accuracy trade-offs)\n", + "summary_by_n, pareto_by_n, picks_by_n, global_reco = analyze_efficiency(df)\n", + "\n", + "print(\"Top efficiency picks per n_components:\")\n", + "display(summary_by_n.head(20)) # or print(summary_by_n.to_string(index=False))\n", + "\n", + "print(\"\\nMost efficient configuration overall:\")\n", + "print(global_reco.to_string())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_time_vs_score_for_n(cv_df, n, use_lcb=True):\n", + " df, _ = _prep(cv_df)\n", + " score_col = \"lcb\" if use_lcb else \"mean\"\n", + " df[\"total_time\"] = df[\"fit_time\"] + df[\"score_time\"]\n", + "\n", + " g = df[df[\"n\"] == n].dropna(subset=[\"total_time\", score_col]).copy()\n", + " if g.empty:\n", + " print(f\"No rows for n={n}\")\n", + " return\n", + "\n", + " mask = _pareto_mask(g, score_col=score_col, time_col=\"total_time\")\n", + " pareto = g.loc[mask].copy()\n", + "\n", + " plt.figure(figsize=(7, 4.5))\n", + " plt.scatter(g[\"total_time\"], g[score_col], alpha=0.3, s=16)\n", + " p_sorted = pareto.sort_values([\"total_time\", 
score_col], ascending=[True, False])\n", + " plt.plot(p_sorted[\"total_time\"], p_sorted[score_col], marker=\"o\")\n", + " plt.xlabel(\"Total time (fit + score, s)\")\n", + " plt.ylabel(\"CV score\" + (\" (LCB)\" if use_lcb else \" (mean)\"))\n", + " plt.title(f\"Time vs Score with Pareto frontier — n={n}\")\n", + " plt.grid(True, alpha=0.3)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + "# Example usage:\n", + "plot_time_vs_score_for_n(df, n=128, use_lcb=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(df.iloc[np.argmax(df['mean_fit_time'])]['params']) # Check the highest mean fit time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import matplotlib as mpl\n", + "\n", + "# Pivot table for contour plot with log2 transformation\n", + "pivot = df.pivot(\n", + " index='param_svc__C', \n", + " columns='param_svc__gamma', \n", + " values=\"mean_test_score\"\n", + ")\n", + "\n", + "# Create new log2-transformed indices and columns\n", + "log2_columns = np.log2(pivot.columns)\n", + "log2_indices = np.log2(pivot.index)\n", + "\n", + "\n", + "# Create a contour plot using log2-transformed data\n", + "plt.figure(figsize=(8, 6))\n", + "X, Y = np.meshgrid(log2_columns, log2_indices)\n", + "Z = pivot.values\n", + "\n", + "levels = np.linspace(Z.min(), Z.max(), 10) # Adjust the number of levels as needed\n", + "levels = [0.9, 0.92, 0.96, 0.98, 0.99]\n", + "# Create filled contours\n", + "# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n", + "# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n", + "\n", + "# Add contour lines\n", + "contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n", + "plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n", + "\n", + 
"# Set axis labels and title\n", + "plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n", + "plt.xlabel(\"log₂(gamma)\")\n", + "plt.ylabel(\"log₂(C)\")\n", + "\n", + "# Since we're already using log2 values, no need to transform tick labels\n", + "plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n", + "plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n", + "\n", + "print(plt.gca().get_yticklabels())\n", + "print(plt.gca().get_xticklabels())\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Evaluation Baseline on Dataset B" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from src.ml.model_selection import create_ready_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X2a, y = create_ready_data('D:/thesis/data/converted/raw/sensor2')\n", + "\n", + "# Display DataFrame\n", + "# pd.concat([\n", + "# pd.DataFrame(y, columns=['label']), # Labels\n", + "# X2a, # Features\n", + "# ], \n", + "# axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# train svm with best params from gridsearchcv\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.svm import SVC\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from joblib import dump, load\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearnex import patch_sklearn\n", + "patch_sklearn()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Retrain the model on the entire dataset\n", + "final_model_baseline = Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA(n_components=32)),\n", + " (\"svc\", 
SVC(C=2**10, gamma=2**-10, kernel='rbf'))\n",
+    "])\n",
+    "\n",
+    "# Fit the model on the entire dataset\n",
+    "final_model_baseline.fit(X1a, y)\n",
+    "\n",
+    "# Save the final model\n",
+    "from joblib import dump\n",
+    "dump(final_model_baseline, \"D:/thesis/models/Sensor A/baseline_pca32_c10_g-10.joblib\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X1b, y1b = create_ready_data('D:/thesis/data/converted/raw_B/sensor1')\n",
+    "X2b, y2b = create_ready_data('D:/thesis/data/converted/raw_B/sensor2')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# svm_model = load('D:/thesis/models/sensor2/SVM.joblib')\n",
+    "model1a = load(\"D:/thesis/models/Sensor A/baseline_pca32_c10_g-10.joblib\")\n",
+    "y_pred_model1a = model1a.predict(X1b)\n",
+    "model2a = load(\"D:/thesis/models/Sensor B/baseline_pca16_c5_g-5.joblib\")\n",
+    "y_pred_model2a = model2a.predict(X2b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Classification Report for Sensor B Baseline Model:\")\n",
+    "print(pd.DataFrame(classification_report(y2b, y_pred_model2a, output_dict=True)).transpose().to_latex(index=True, float_format=\"%.2f\"))\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "cm = confusion_matrix(y2b, y_pred_model2a, labels=[0,1,2,3,4,5,6])\n",
+    "plt.figure(figsize=(8, 6), dpi=300)\n",
+    "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", cbar=False)\n",
+    "plt.xlabel(\"Predicted Label\")\n",
+    "plt.ylabel(\"True Label\")\n",
+    "# plt.title(\"Confusion Matrix for Sensor B Baseline Model\")\n",
+    "# add cells outline\n",
+    "for i in range(cm.shape[0]):\n",
+    "    for j in range(cm.shape[1]):\n",
+    "        plt.gca().add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='black', 
lw=0.5))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Fine Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = [\n", + " { \n", + " \"svc\": [SVC()],\n", + " \"svc__kernel\": [\"rbf\"],\n", + " \"svc__C\": np.logspace(3, 7, 9, base=2),\n", + " \"svc__gamma\": np.logspace(-7, -3, 9, base=2)\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearnex\n", + "sklearnex.patch_sklearn()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "models_sensor = {\n", + " # \"Random Forest\": RandomForestClassifier(),\n", + " # \"Bagged Trees\": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10),\n", + " # \"Decision Tree\": DecisionTreeClassifier(),\n", + " # \"KNN\": KNeighborsClassifier(),\n", + " # \"LDA\": LinearDiscriminantAnalysis(),\n", + " # \"SVM\": make_pipeline(\n", + " # SVC(kernel='rbf')\n", + " # ),\n", + " # \"Grid SVM\": GridSearchCV(\n", + " # Pipeline([\n", + " # (\"scaler\", StandardScaler()),\n", + " # (\"svc\", SVC())\n", + " # ]),\n", + " # cv=cv,\n", + " # param_grid=param_grid,\n", + " # n_jobs=-1,\n", + " # scoring='accuracy',\n", + " # verbose=2\n", + " # ),\n", + "\n", + " \"FineGrid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA(n_components=16)),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv,\n", + " param_grid=param_grid,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", + "\n", + " # \"XGBoost\": XGBClassifier()\n", + " # \"MLPClassifier\": 
make_pipeline(\n", + " # StandardScaler(),\n", + " # MLPClassifier(hidden_layer_sizes=(1, 10), max_iter=500, random_state=42)\n", + " # )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_sensor2 = []\n", + "for name, model in models_sensor.items():\n", + " res = train_and_evaluate_model(model, name, \"Sensor B\", x_train2, y_train2, x_test2, y_test2, \n", + " export='D:/thesis/models/Sensor B')\n", + " results_sensor2.append(res)\n", + " print(f\"{name} on sensor2: Accuracy = {res['accuracy']:.2f}%\")\n", + "\n", + "# Display result\n", + "results_sensor2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import load\n", + "\n", + "model: GridSearchCV = load('D:/thesis/models/Sensor A/FineGrid+SVM+StandardScaler+PCA.joblib')\n", + "df = model.cv_results_\n", + "df = pd.DataFrame(df)\n", + "df['param_pca__n_components'] = 32\n", + "# df['param_pca__n_components'].unique()\n", + "# df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n", + "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n", + "# df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# turn param_svc__C and param_svc__gamma to log2 scale\n", + "\n", + "# df.iloc[np.argmax(df['mean_test_score'])]\n", + "df.nlargest(10, 'mean_test_score')\n", + "\n", + "# add best c and gamma for sensor B" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import matplotlib as mpl\n", + "\n", + "# Pivot table for contour plot with log2 transformation\n", + "pivot = df.pivot(\n", + " index='param_svc__C', \n", + " columns='param_svc__gamma', \n", + " values=\"mean_test_score\"\n", + ")\n", + "\n", + "# Create 
new log2-transformed indices and columns\n", + "log2_columns = np.log2(pivot.columns)\n", + "log2_indices = np.log2(pivot.index)\n", + "\n", + "\n", + "# Create a contour plot using log2-transformed data\n", + "plt.figure(figsize=(8, 6))\n", + "X, Y = np.meshgrid(log2_columns, log2_indices)\n", + "Z = pivot.values\n", + "\n", + "levels = np.linspace(0.6, Z.max(), 200) # Adjust the number of levels as needed\n", + "levels = [0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.978, 0.979, 0.9792]\n", + "# Create filled contours\n", + "# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n", + "# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n", + "\n", + "# Add contour lines\n", + "contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n", + "plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n", + "\n", + "# Set axis labels and title\n", + "plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n", + "plt.xlabel(\"log₂(gamma)\")\n", + "plt.ylabel(\"log₂(C)\")\n", + "\n", + "# Since we're already using log2 values, no need to transform tick labels\n", + "plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n", + "plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n", + "\n", + "print(plt.gca().get_yticklabels())\n", + "print(plt.gca().get_xticklabels())\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot contour plot of mean_test_score with x=param_svc__C and y=param_svc__gamma for each param_pca__n_components\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import seaborn as sns\n", + "norm = plt.Normalize(vmin=0.99, vmax=1)\n", + "for i in df['param_pca__n_components'].unique():\n", + " subset = df[df['param_pca__n_components'] == i]\n", + " pivot_table = subset.pivot(index='param_svc__gamma', 
columns='param_svc__C', values='mean_test_score')\n", + " plt.figure(figsize=(8, 6), dpi=300)\n", + " sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm, cbar=False)\n", + " # plt.title(f'Grid Search Mean Test Score (PCA components={i})')\n", + " plt.xlabel('C (log2 scale)')\n", + " plt.ylabel('Gamma (log2 scale)')\n", + " # plt.xscale('log', base=2)\n", + " # plt.yscale('log', base=2)\n", + " # set x and y ticks by converting the number into log base 2\n", + "# plt.xticks(ticks=np.arange(len(pivot_table.columns)), labels=[f\"{np.log2(c):.1f}\" for c in pivot_table.columns])\n", + "# plt.yticks(ticks=np.arange(len(pivot_table.index)), labels=[f\"{np.log2(g):.1f}\" for g in pivot_table.index])\n", + "\n", + " # make the mark ticks to be at the center of each cell range\n", + " plt.xticks(ticks=np.arange(len(pivot_table.columns)) + 0.5, labels=[f\"{np.log2(c):.1f}\" for c in pivot_table.columns], rotation=0)\n", + " plt.yticks(ticks=np.arange(len(pivot_table.index)) + 0.5, labels=[f\"{np.log2(g):.1f}\" for g in pivot_table.index])\n", + "\n", + " # add outline for the heatmap cells\n", + " for (i, j), val in np.ndenumerate(pivot_table.values):\n", + " plt.gca().add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='black', lw=0.5))\n", + "\n", + " # give mark by outline the heatmap cells with max mean_test_score\n", + " max_idx = subset['mean_test_score'].idxmax()\n", + " max_C = subset.loc[max_idx, 'param_svc__C']\n", + " max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n", + " max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n", + " max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n", + " plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=2))\n", + " plt.tick_params(axis='x', length=0) # Remove x-axis tick marks\n", + " plt.tick_params(axis='y', length=0) # Remove y-axis tick marks\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [
+    "#### Evaluation on Fine Grid Search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Retrain the model on the entire dataset\n",
+    "final_model_finegrid = Pipeline([\n",
+    "    (\"scaler\", StandardScaler()),\n",
+    "    (\"pca\", PCA(n_components=16)),\n",
+    "    (\"svc\", SVC(C=2**8, gamma=2**-8, kernel='rbf'))\n",
+    "])\n",
+    "\n",
+    "# Fit the model on the entire dataset\n",
+    "final_model_finegrid.fit(X1a, y)\n",
+    "\n",
+    "# Save the final model\n",
+    "from joblib import dump\n",
+    "dump(final_model_finegrid, \"D:/thesis/models/Sensor A/finegrid_pca16_c8_g-8.joblib\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# svm_model = load('D:/thesis/models/sensor2/SVM.joblib')\n",
+    "model1a = load(\"D:/thesis/models/Sensor A/finegrid_pca16_c8_g-8.joblib\")\n",
+    "y_pred_model1a = model1a.predict(X1b)\n",
+    "model2a = load(\"D:/thesis/models/Sensor B/finegrid_pca16_c3_g-5.5.joblib\")\n",
+    "y_pred_model2a = model2a.predict(X2b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(pd.DataFrame(classification_report(y1b, y_pred_model1a, output_dict=True)).transpose().to_latex(index=True, float_format=\"%.2f\"))\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "cm = confusion_matrix(y1b, y_pred_model1a, labels=[0,1,2,3,4,5,6])\n",
+    "plt.figure(figsize=(8, 6), dpi=300)\n",
+    "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", cbar=False)\n",
+    "plt.xlabel(\"Predicted Label\")\n",
+    "plt.ylabel(\"True Label\")\n",
+    "# plt.title(\"Confusion Matrix for Sensor A Fine Grid Model\")\n",
+    "# add cells outline\n",
+    "for i in range(cm.shape[0]):\n",
+    "    for j in range(cm.shape[1]):\n",
+    "        plt.gca().add_patch(plt.Rectangle((j, i), 1, 1, fill=False, 
edgecolor='black', lw=0.5))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### n_components accuracy vs fit time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Scale features\n", + "scaler = StandardScaler()\n", + "x_train1_scaled = scaler.fit_transform(x_train1)\n", + "x_test1_scaled = scaler.transform(x_test1)\n", + "\n", + "x_train2_scaled = scaler.fit_transform(x_train2)\n", + "x_test2_scaled = scaler.transform(x_test2)\n", + "\n", + "# Generate n_components list: 512 → 256 → 128 → ...\n", + "max_dim = 512 # cap at dataset dimension\n", + "n_components_list = []\n", + "n = max_dim\n", + "while n >= 1:\n", + " n_components_list.append(n)\n", + " n //= 2 # halve each time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "def perform_pca_accuracy_time(n_components_list, x_train, x_test, y_train, y_test):\n", + " times = []\n", + " accuracies = []\n", + " metadata = {}\n", + "\n", + " # Scale features\n", + " scaler = StandardScaler()\n", + " x_train_scaled = scaler.fit_transform(x_train)\n", + " x_test_scaled = scaler.transform(x_test)\n", + "\n", + " # Generate n_components list: 512 → 256 → 128 → ...\n", + " max_dim = 512 # cap at dataset dimension\n", + " n_components_list = []\n", + " n = max_dim\n", + " while n >= 1:\n", + " n_components_list.append(n)\n", + " n //= 2 # halve each time\n", + " \n", + " # Evaluate PCA + SVM for each n_components [1, 2, 4, ..., 512]\n", + " for n in reversed(n_components_list):\n", + " # PCA\n", + " pca = PCA(n_components=n)\n", + " X_train_pca = pca.fit_transform(x_train_scaled)\n", + " X_test_pca = pca.transform(x_test_scaled)\n", + "\n", + " # SVM\n", + " clf = SVC(kernel=\"rbf\", gamma=GAMMA_BEST, C=C_BEST)\n", + " start = time.time()\n", + " clf.fit(X_train_pca, 
y_train)\n", + " fit_time = time.time() - start\n", + " \n", + " y_pred = clf.predict(X_test_pca)\n", + " acc = accuracy_score(y_test, y_pred)\n", + " \n", + " times.append(fit_time)\n", + " accuracies.append(acc)\n", + "\n", + " metadata[n] = {\n", + " \"fit_time\": fit_time,\n", + " \"accuracy\": acc\n", + " }\n", + " return metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mpl_toolkits.axes_grid1 import host_subplot\n", + "import numpy as np\n", + "\n", + "def plot_perform_pca_accuracy_time(metadata):\n", + " host = host_subplot(111)\n", + " par = host.twinx()\n", + "\n", + " host.set_xlabel(\"n_components (log2 scale)\")\n", + " host.set_ylabel(\"Fit time (s)\")\n", + " par.set_ylabel(\"Accuracy\")\n", + "\n", + " x = [np.log2(i) for i in list(metadata.keys())]\n", + "\n", + " p1, = host.plot(x, [i[1]['fit_time'] for i in metadata.items()], label=\"Fit Time\")\n", + " p2, = par.plot(x, [i[1]['accuracy'] for i in metadata.items()], label=\"Accuracy\")\n", + "\n", + " # add grid lines\n", + "\n", + " host.grid(True)\n", + "\n", + " host.legend(labelcolor=\"linecolor\")\n", + "\n", + " # rename x axis label with f\"np.exp2(x)\"\n", + " # host.set_xticklabels([f\"{np.exp2(i):.0f}\" for i in x])\n", + "\n", + " # resize x axis to include up to 10 with interval 1\n", + " host.set_xticks(np.arange(min(x), max(x)+1, 1))\n", + " host.set_ylim(0, max([i[1]['fit_time'] for i in metadata.items()])*1.1)\n", + " par.set_ylim(0, 1.1)\n", + "\n", + " # show both y value text on each point\n", + " for i in range(len(x)):\n", + " host.text(x[i], metadata[list(metadata.keys())[i]]['fit_time'], f\"{metadata[list(metadata.keys())[i]]['fit_time']:.1f}\", color=p1.get_color())\n", + " par.text(x[i], metadata[list(metadata.keys())[i]]['accuracy'], f\"{metadata[list(metadata.keys())[i]]['accuracy']:.3f}\", color=p2.get_color())\n", + "\n", + " host.yaxis.label.set_color(p1.get_color())\n", + " 
par.yaxis.label.set_color(p2.get_color())\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def ratio_fit_time_accuracy(metadata):\n", + " ratios = {}\n", + " for n, values in metadata.items():\n", + " fit_time = values['fit_time']\n", + " accuracy = values['accuracy']\n", + " if accuracy != 0:\n", + " ratio = fit_time / accuracy\n", + " else:\n", + " ratio = float('inf') # Avoid division by zero\n", + " ratios[n] = ratio\n", + " return ratios" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Sensor A" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata_A = perform_pca_accuracy_time(n_components_list, x_train1, x_test1, y_train1, y_test1) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_perform_pca_accuracy_time(metadata_A)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ratios_A = ratio_fit_time_accuracy(metadata_A)\n", + "# scatter line\n", + "\n", + "plt.plot([np.log2(i) for i in ratios_A.keys()], ratios_A.values(), marker='o')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Sensor B" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata_B = perform_pca_accuracy_time(n_components_list, x_train2, x_test2, y_train2, y_test2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_perform_pca_accuracy_time(metadata_B) # add ratio plot between fit time and acc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# time in ms\n", + "import time\n", + "\n", + "start_time = time.time()\n", + 
"print(model.predict(x_test1.iloc[0:1, :]))\n", + "print(y_test1[0:1])\n", + "end_time = time.time()\n", + "print(f\"Prediction time: {(end_time - start_time) * 1000} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(pd.DataFrame(results_sensor1[0][\"classification_report\"]).transpose().to_latex(index=True, float_format=\"%.2f\", caption=\"Classification report on Dataset B\", label=\"tab:clf_report_auto\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "X = X1a\n", + "y = y\n", + "\n", + "# -----------------------\n", + "# PCA with 16 components\n", + "# (use min(n_features, 16) if fewer features)\n", + "# -----------------------\n", + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(X)\n", + "\n", + "n_components = min(16, X.shape[1]) # safe guard\n", + "pca = PCA(n_components=n_components)\n", + "X_pca = pca.fit_transform(X_scaled)\n", + "\n", + "# Wrap into DataFrame\n", + "pc_df = pd.DataFrame(\n", + " X_pca,\n", + " columns=[f\"PC{i+1}\" for i in range(n_components)]\n", + ")\n", + "pc_df[\"target\"] = y\n", + "\n", + "# -----------------------\n", + "# Save scatter plots for each PC pair\n", + "# -----------------------\n", + "output_folder = \"D:/thesis/figures/pca_scatter_plots\"\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "\n", + "for i in range(n_components):\n", + " for j in range(i+1, n_components):\n", + " plt.figure(figsize=(6, 5))\n", + " for label in set(y):\n", + " subset = pc_df[pc_df[\"target\"] == label]\n", + " plt.scatter(\n", + " subset[f\"PC{i+1}\"],\n", + " subset[f\"PC{j+1}\"],\n", + " label=f\"Class {label}\",\n", + " alpha=0.6\n", + " )\n", + " plt.xlabel(f\"PC{i+1} 
({pca.explained_variance_ratio_[i]:.2%})\")\n", + " plt.ylabel(f\"PC{j+1} ({pca.explained_variance_ratio_[j]:.2%})\")\n", + " plt.title(f\"PCA Scatter: PC{i+1} vs PC{j+1}\")\n", + " plt.legend()\n", + " plt.tight_layout()\n", + "\n", + " # Save to folder\n", + " filename = f\"PC{i+1}_vs_PC{j+1}.png\"\n", + " plt.savefig(os.path.join(output_folder, filename))\n", + " plt.close()\n", + "\n", + "print(f\"All pairwise plots saved to folder: {output_folder}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid_B = [\n", + " { \n", + " \"svc\": [SVC()],\n", + " \"svc__kernel\": [\"rbf\"],\n", + " \"svc__C\": np.exp2(np.arange(-5, 16, 5)),\n", + " \"svc__gamma\": np.exp2(np.arange(-15, 6, 5)),\n", + " \"pca__n_components\": [512, 256, 128, 64, 32, 16, 8, 4]\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv_B = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -752,12 +2472,18 @@ " # \"Decision Tree\": DecisionTreeClassifier(),\n", " # \"KNN\": KNeighborsClassifier(),\n", " # \"LDA\": LinearDiscriminantAnalysis(),\n", - " \"SVM\": SVC(),\n", - " # \"SVM with StandardScaler and PCA\": make_pipeline(\n", - " # StandardScaler(),\n", - " # PCA(n_components=10),\n", - " # SVC(kernel='rbf')\n", - " # ),\n", + " \"Grid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA()),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv_B,\n", + " param_grid=param_grid_B,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", " # \"XGBoost\": XGBClassifier()\n", "}" ] @@ -770,15 +2496,29 @@ "source": [ "results_sensor2 = []\n", "for name, model in models_sensor2.items():\n", - " res = train_and_evaluate_model(model, name, \"sensor2\", x_train2, y_train2, x_test2, 
y_test2, \n", - " export='D:/thesis/models/sensor2')\n", + " res = train_and_evaluate_model(model, name, \"Sensor B\", x_train2, y_train2, x_test2, y_test2, \n", + " export='D:/thesis/models/Sensor B')\n", " results_sensor2.append(res)\n", - " print(f\"{name} on sensor2: Accuracy = {res['accuracy']:.2f}%\")\n", + " print(f\"{name} on Sensor B: Accuracy = {res['accuracy']:.2f}%\")\n", "\n", "# Display result\n", "results_sensor2" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import load\n", + "\n", + "model_B: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n", + "df_B = model_B.cv_results_\n", + "df_B = pd.DataFrame(df_B)\n", + "df_B.iloc[np.argmax(df_B['mean_test_score'])]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -959,6 +2699,15 @@ "print(latex_table)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_rounded" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1142,11 +2891,49 @@ "source": [ "y_pred_svm" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrain whole dataset with best params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_sensor1 = {\n", + " \"Grid+SVM+StandardScaler+PCA\": GridSearchCV(\n", + " Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"pca\", PCA()),\n", + " (\"svc\", SVC())\n", + " ]),\n", + " cv=cv,\n", + " param_grid=param_grid,\n", + " n_jobs=-1,\n", + " scoring='accuracy',\n", + " verbose=4\n", + " ),\n", + "}" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "thesis", "language": "python", "name": "python3" },