feat(notebooks): refactor and add rectangle of prev grid search

2025-10-15 20:19:44 +07:00
parent df38c00935
commit 06f158f143
1 changed files with 112 additions and 100 deletions
--- a/code/notebooks/stft.ipynb
+++ b/code/notebooks/stft.ipynb
@@ -717,25 +717,25 @@
    "import numpy as np\n",
    "\n",
    "# Original data - Cumulative explained variance for first dataset\n",
-    "cumulative_variance = np.cumsum(pca.explained_variance_ratio_)\n",
+    "cumulative_variance_1 = np.cumsum(pca1.explained_variance_ratio_)\n",
-    "n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1\n",
+    "n_components_95 = np.argmax(cumulative_variance_1 >= 0.95) + 1\n",
    "n_components_16 = 16\n",
-    "cumulative_variance_16 = cumulative_variance[n_components_16 - 1]\n",
+    "cumulative_variance_1_16 = cumulative_variance_1[n_components_16 - 1]\n",
    "\n",
    "# Create figure and primary axis\n",
    "fig, ax1 = plt.subplots(figsize=(8, 6))\n",
    "\n",
    "# Plot first dataset on primary axis\n",
-    "ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', \n",
+    "ax1.plot(range(1, len(cumulative_variance_1) + 1), cumulative_variance_1, linestyle='-', \n",
-    "         label='Dataset 1', color='blue')\n",
+    "         label='Sensor A', color='blue')\n",
    "ax1.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')\n",
    "ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'n={n_components_95}')\n",
-    "ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_16:.2f})')\n",
+    "ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_1_16:.2f})')\n",
-    "ax1.axhline(y=cumulative_variance_16, color='b', linestyle='--')\n",
+    "ax1.axhline(y=cumulative_variance_1_16, color='b', linestyle='--')\n",
    "\n",
    "# Set labels and properties for first dataset\n",
    "ax1.set_xlabel('Principal Component')\n",
-    "ax1.set_ylabel('Dataset 1 Variance Ratio', color='blue')\n",
+    "ax1.set_ylabel('Sensor A Variance Ratio', color='blue')\n",
    "ax1.tick_params(axis='y', labelcolor='blue')\n",
    "ax1.grid(True, alpha=0.3)\n",
    "\n",
@@ -744,14 +744,17 @@
    "\n",
    "# Second dataset - Cumulative explained variance for Sensor B\n",
    "\n",
    "cumulative_variance_2 = np.cumsum(pca2.explained_variance_ratio_)\n",
    "n_components_95_2 = np.argmax(cumulative_variance_2 >= 0.95) + 1\n",
    "n_components_16_2 = 16\n",
    "cumulative_variance_2_16 = cumulative_variance_2[n_components_16_2 - 1]\n",
    "# Plot second dataset on secondary axis\n",
-    "ax2.plot(range(1, len(second_dataset) + 1), second_dataset, marker='s', linestyle='-', \n",
+    "ax2.plot(range(1, len(cumulative_variance_2) + 1), cumulative_variance_2, marker='s', linestyle='-', \n",
-    "         color='red', label='Dataset 2')\n",
+    "         color='red', label='Sensor B', alpha=0.1)\n",
    "\n",
    "# Set properties for second dataset\n",
-    "ax2.set_ylabel('Dataset 2 Variance Ratio', color='red')\n",
+    "ax2.set_ylabel('Sensor B Variance Ratio', color='red')\n",
    "ax2.tick_params(axis='y', labelcolor='red')\n",
    "\n",
    "# Create combined legend\n",
@@ -1310,7 +1313,17 @@
    "model: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n",
    "df = model.cv_results_\n",
    "df = pd.DataFrame(df)\n",
-    "df\n",
+    "# drop every per-split column plus the 'param_svc' and 'params' columns\n",
    "cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n",
    "df_latex = df.drop(columns=cols_to_remove)\n",
    "# cols_to_remove\n",
    "# turn param C and gamma to log2 scale\n",
    "df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n",
    "df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n",
    "df_latex\n",
    "# sort by rank\n",
    "df_latex = df_latex.sort_values(by='rank_test_score')\n",
    "print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n",
    "# change column \"param_svc__C\" to np.log2\n",
    "# df['param_svc__C'] = np.log2(df['param_svc__C'])\n",
    "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma'])\n",
@@ -1324,7 +1337,32 @@
    "# Get rows where param_pca__n_components is 32\n",
    "# result = df[df['param_pca__n_components'] == 32]\n",
    "# top 10 rows by mean test score\n",
-    "result.nlargest(10, 'mean_test_score')"
+    "# result.nlargest(10, 'mean_test_score')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# `model` is the fitted GridSearchCV object loaded earlier in this notebook\n",
    "cv_results = model.cv_results_\n",
    "\n",
    "# Per-candidate mean fit / score time of a single CV split, reported by scikit-learn in seconds\n",
    "mean_fit_time = cv_results['mean_fit_time']\n",
    "mean_score_time = cv_results['mean_score_time']\n",
    "\n",
    "# Number of cross-validation splits actually used by the search (instead of a hard-coded 5)\n",
    "n_splits = model.n_splits_\n",
    "\n",
    "# Scale the per-split means by the split count, sum over all candidates, then convert\n",
    "# seconds -> minutes so the value matches the unit in the message below.\n",
    "# NOTE: this is summed fit + score time, not wall-clock time if the search ran in parallel.\n",
    "total_time_elapsed = np.sum((mean_fit_time + mean_score_time) * n_splits) / 60\n",
    "\n",
    "print(f\"Total time elapsed for GridSearchCV: {total_time_elapsed:.2f} minutes\")"
   ]
  },
  {
@@ -1711,6 +1749,13 @@
    "##### Evaluation Baseline on Dataset B"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Train test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -1754,6 +1799,13 @@
    "patch_sklearn()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -1843,15 +1895,7 @@
    "        \"svc__C\": np.logspace(3, 7, 9, base=2),\n",
    "        \"svc__gamma\": np.logspace(-7, -3, 9, base=2)\n",
    "    },\n",
-    "]"
+    "]\n",
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)"
   ]
  },
@@ -1862,7 +1906,7 @@
   "outputs": [],
   "source": [
    "import sklearnex\n",
-    "sklearnex.patch_sklearn()\n"
+    "sklearnex.patch_sklearn()"
   ]
  },
  {
@@ -1930,6 +1974,13 @@
    "results_sensor2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Results DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -1938,89 +1989,32 @@
   "source": [
    "from joblib import load\n",
    "\n",
-    "model: GridSearchCV = load('D:/thesis/models/Sensor A/FineGrid+SVM+StandardScaler+PCA.joblib')\n",
+    "model: GridSearchCV = load('D:/thesis/models/Sensor B/FineGrid+SVM+StandardScaler+PCA.joblib')\n",
    "df = model.cv_results_\n",
    "df = pd.DataFrame(df)\n",
    "df['param_pca__n_components'] = 32\n",
    "# drop every per-split column plus the 'param_svc' and 'params' columns\n",
    "cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n",
    "df_latex = df.drop(columns=cols_to_remove)\n",
    "# cols_to_remove\n",
    "# turn param C and gamma to log2 scale\n",
    "df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n",
    "df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n",
    "df_latex\n",
    "# sort by rank\n",
    "df_latex = df_latex.sort_values(by='rank_test_score')\n",
    "# print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n",
    "# df['param_pca__n_components'].unique()\n",
    "# df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n",
    "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n",
-    "# df"
+    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# turn param_svc__C and param_svc__gamma to log2 scale\n",
    "\n",
    "# df.iloc[np.argmax(df['mean_test_score'])]\n",
    "df.nlargest(10, 'mean_test_score')\n",
    "\n",
    "# add best c and gamma for sensor B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import matplotlib as mpl\n",
    "\n",
    "# Pivot table for contour plot with log2 transformation\n",
    "pivot = df.pivot(\n",
    "    index='param_svc__C', \n",
    "    columns='param_svc__gamma', \n",
    "    values=\"mean_test_score\"\n",
    ")\n",
    "\n",
    "# Create new log2-transformed indices and columns\n",
    "log2_columns = np.log2(pivot.columns)\n",
    "log2_indices = np.log2(pivot.index)\n",
    "\n",
    "\n",
    "# Create a contour plot using log2-transformed data\n",
    "plt.figure(figsize=(8, 6))\n",
    "X, Y = np.meshgrid(log2_columns, log2_indices)\n",
    "Z = pivot.values\n",
    "\n",
    "levels = np.linspace(0.6, Z.max(), 200)  # Adjust the number of levels as needed\n",
    "levels = [0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.978, 0.979, 0.9792]\n",
    "# Create filled contours\n",
    "# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n",
    "# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n",
    "\n",
    "# Add contour lines\n",
    "contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n",
    "plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n",
    "\n",
    "# Set axis labels and title\n",
    "plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n",
    "plt.xlabel(\"log₂(gamma)\")\n",
    "plt.ylabel(\"log₂(C)\")\n",
    "\n",
    "# Since we're already using log2 values, no need to transform tick labels\n",
    "plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n",
    "plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n",
    "\n",
    "print(plt.gca().get_yticklabels())\n",
    "print(plt.gca().get_xticklabels())\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Heatmap"
+    "##### Heatmap"
   ]
  },
  {
@@ -2035,7 +2029,7 @@
    "import seaborn as sns\n",
    "norm = plt.Normalize(vmin=0.99, vmax=1)\n",
    "for i in df['param_pca__n_components'].unique():\n",
-    "     subset = df[df['param_pca__n_components'] == i]\n",
+    "     subset: pd.DataFrame = df[df['param_pca__n_components'] == i]\n",
    "     pivot_table = subset.pivot(index='param_svc__gamma', columns='param_svc__C', values='mean_test_score')\n",
    "     plt.figure(figsize=(8, 6), dpi=300)\n",
    "     sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm, cbar=False)\n",
@@ -2062,17 +2056,26 @@
    "     max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n",
    "     max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n",
    "     max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n",
-    "     plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=2))\n",
+    "     plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=3))\n",
    "     # Mark the optimum of the previous grid search (C=2**5, gamma=2**-5) with a dashed\n",
    "     # blue rectangle; skipped when that point is not part of this grid, so the cell does not raise\n",
    "     prev_max_C, prev_max_gamma = 2**5, 2**-5\n",
    "     prev_col = np.where(pivot_table.columns == prev_max_C)[0]\n",
    "     prev_row = np.where(pivot_table.index == prev_max_gamma)[0]\n",
    "     if prev_col.size and prev_row.size:\n",
    "          # anchor the 1x1 rectangle at the integer (column, row) position of the matching cell\n",
    "          plt.gca().add_patch(plt.Rectangle((prev_col[0], prev_row[0]), 1, 1, fill=False, edgecolor='blue', lw=3, linestyle='--'))\n",
    "     plt.tick_params(axis='x', length=0)  # Remove x-axis tick marks\n",
    "     plt.tick_params(axis='y', length=0)  # Remove y-axis tick marks\n",
-    "     plt.show()"
+    "     # plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Evaluation on Fine Grid Search"
+    "#### Evaluation Fine Grid Search on Dataset B"
   ]
  },
  {
@@ -2084,7 +2087,7 @@
    "# Retrain the model on the entire dataset\n",
    "final_model_finegrid = Pipeline([\n",
    "    (\"scaler\", StandardScaler()),\n",
-    "    (\"pca\", PCA(n_components=16)),\n",
+    "    (\"pca\", PCA(n_components=32)),\n",
    "    (\"svc\", SVC(C=2**8, gamma=2**-8, kernel='rbf'))\n",
    "])\n",
    "\n",
@@ -2109,6 +2112,15 @@
    "y_pred_model2a = model2a.predict(X2b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model1a.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,