feat(notebooks): refactor and add rectangle of prev grid search

This commit is contained in:
nuluh
2025-10-15 20:19:44 +07:00
parent df38c00935
commit 06f158f143

View File

@@ -717,25 +717,25 @@
"import numpy as np\n", "import numpy as np\n",
"\n", "\n",
"# Original data - Cumulative explained variance for first dataset\n", "# Original data - Cumulative explained variance for first dataset\n",
"cumulative_variance = np.cumsum(pca.explained_variance_ratio_)\n", "cumulative_variance_1 = np.cumsum(pca1.explained_variance_ratio_)\n",
"n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1\n", "n_components_95 = np.argmax(cumulative_variance_1 >= 0.95) + 1\n",
"n_components_16 = 16\n", "n_components_16 = 16\n",
"cumulative_variance_16 = cumulative_variance[n_components_16 - 1]\n", "cumulative_variance_1_16 = cumulative_variance_1[n_components_16 - 1]\n",
"\n", "\n",
"# Create figure and primary axis\n", "# Create figure and primary axis\n",
"fig, ax1 = plt.subplots(figsize=(8, 6))\n", "fig, ax1 = plt.subplots(figsize=(8, 6))\n",
"\n", "\n",
"# Plot first dataset on primary axis\n", "# Plot first dataset on primary axis\n",
"ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', \n", "ax1.plot(range(1, len(cumulative_variance_1) + 1), cumulative_variance_1, linestyle='-', \n",
" label='Dataset 1', color='blue')\n", " label='Sensor A', color='blue')\n",
"ax1.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')\n", "ax1.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')\n",
"ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'n={n_components_95}')\n", "ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'n={n_components_95}')\n",
"ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_16:.2f})')\n", "ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_1_16:.2f})')\n",
"ax1.axhline(y=cumulative_variance_16, color='b', linestyle='--')\n", "ax1.axhline(y=cumulative_variance_1_16, color='b', linestyle='--')\n",
"\n", "\n",
"# Set labels and properties for first dataset\n", "# Set labels and properties for first dataset\n",
"ax1.set_xlabel('Principal Component')\n", "ax1.set_xlabel('Principal Component')\n",
"ax1.set_ylabel('Dataset 1 Variance Ratio', color='blue')\n", "ax1.set_ylabel('Sensor A Variance Ratio', color='blue')\n",
"ax1.tick_params(axis='y', labelcolor='blue')\n", "ax1.tick_params(axis='y', labelcolor='blue')\n",
"ax1.grid(True, alpha=0.3)\n", "ax1.grid(True, alpha=0.3)\n",
"\n", "\n",
@@ -744,14 +744,17 @@
"\n", "\n",
"# Example second dataset (replace with your actual data)\n", "# Example second dataset (replace with your actual data)\n",
"# For demonstration, I'm creating synthetic data with a different scale\n", "# For demonstration, I'm creating synthetic data with a different scale\n",
"second_dataset = np.sqrt(cumulative_variance) # Just an example - replace with your actual data\n",
"\n", "\n",
"cumulative_variance_2 = np.cumsum(pca2.explained_variance_ratio_)\n",
"n_components_95_2 = np.argmax(cumulative_variance_2 >= 0.95) + 1\n",
"n_components_16_2 = 16\n",
"cumulative_variance_2_16 = cumulative_variance_2[n_components_16_2 - 1]\n",
"# Plot second dataset on secondary axis\n", "# Plot second dataset on secondary axis\n",
"ax2.plot(range(1, len(second_dataset) + 1), second_dataset, marker='s', linestyle='-', \n", "ax2.plot(range(1, len(cumulative_variance_2) + 1), cumulative_variance_2, marker='s', linestyle='-', \n",
" color='red', label='Dataset 2')\n", " color='red', label='Sensor B', alpha=0.1)\n",
"\n", "\n",
"# Set properties for second dataset\n", "# Set properties for second dataset\n",
"ax2.set_ylabel('Dataset 2 Variance Ratio', color='red')\n", "ax2.set_ylabel('Sensor B Variance Ratio', color='red')\n",
"ax2.tick_params(axis='y', labelcolor='red')\n", "ax2.tick_params(axis='y', labelcolor='red')\n",
"\n", "\n",
"# Create combined legend\n", "# Create combined legend\n",
@@ -1310,7 +1313,17 @@
"model: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n", "model: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n",
"df = model.cv_results_\n", "df = model.cv_results_\n",
"df = pd.DataFrame(df)\n", "df = pd.DataFrame(df)\n",
"df\n", "# remove all data on param columns and all split score test data\n",
"cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n",
"df_latex = df.drop(columns=cols_to_remove)\n",
"# cols_to_remove\n",
"# turn param C and gamma to log2 scale\n",
"df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n",
"df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n",
"df_latex\n",
"# sort by rank\n",
"df_latex = df_latex.sort_values(by='rank_test_score')\n",
"print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n",
"# change column \"param_svc__C\" to np.log2\n", "# change column \"param_svc__C\" to np.log2\n",
"# df['param_svc__C'] = np.log2(df['param_svc__C'])\n", "# df['param_svc__C'] = np.log2(df['param_svc__C'])\n",
"# df['param_svc__gamma'] = np.log2(df['param_svc__gamma'])\n", "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma'])\n",
@@ -1324,7 +1337,32 @@
"# Get rows where param_pca__n_components is 32\n", "# Get rows where param_pca__n_components is 32\n",
"# result = df[df['param_pca__n_components'] == 32]\n", "# result = df[df['param_pca__n_components'] == 32]\n",
"# top 10 by mean test score\n", "# top 10 by mean test score\n",
"result.nlargest(10, 'mean_test_score')" "# result.nlargest(10, 'mean_test_score')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# `model` is the fitted GridSearchCV object loaded in an earlier cell\n",
"cv_results = model.cv_results_\n",
"\n",
"# Per-candidate fit / score time, averaged over the CV splits (scikit-learn reports seconds)\n",
"mean_fit_time = cv_results['mean_fit_time']\n",
"mean_score_time = cv_results['mean_score_time']\n",
"\n",
"# Number of cross-validation splits actually used by the fitted search\n",
"n_splits = model.n_splits_\n",
"\n",
"# Mean per-split time * number of splits, summed over every candidate.\n",
"# This is cumulative compute time in seconds: it excludes the final refit and\n",
"# ignores parallelism (n_jobs), so it is an estimate rather than wall-clock time.\n",
"total_time_elapsed = np.sum((mean_fit_time + mean_score_time) * n_splits)\n",
"\n",
"# Convert seconds -> minutes so the printed value matches its label\n",
"print(f\"Total time elapsed for GridSearchCV: {total_time_elapsed / 60:.2f} minutes\")"
] ]
}, },
{ {
@@ -1711,6 +1749,13 @@
"##### Evaluation Baseline on Dataset B" "##### Evaluation Baseline on Dataset B"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Train test split"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -1754,6 +1799,13 @@
"patch_sklearn()" "patch_sklearn()"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Training"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -1843,15 +1895,7 @@
" \"svc__C\": np.logspace(3, 7, 9, base=2),\n", " \"svc__C\": np.logspace(3, 7, 9, base=2),\n",
" \"svc__gamma\": np.logspace(-7, -3, 9, base=2)\n", " \"svc__gamma\": np.logspace(-7, -3, 9, base=2)\n",
" },\n", " },\n",
"]" "]\n",
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)"
] ]
}, },
@@ -1862,7 +1906,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import sklearnex\n", "import sklearnex\n",
"sklearnex.patch_sklearn()\n" "sklearnex.patch_sklearn()"
] ]
}, },
{ {
@@ -1930,6 +1974,13 @@
"results_sensor2" "results_sensor2"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Results DataFrame"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -1938,89 +1989,32 @@
"source": [ "source": [
"from joblib import load\n", "from joblib import load\n",
"\n", "\n",
"model: GridSearchCV = load('D:/thesis/models/Sensor A/FineGrid+SVM+StandardScaler+PCA.joblib')\n", "model: GridSearchCV = load('D:/thesis/models/Sensor B/FineGrid+SVM+StandardScaler+PCA.joblib')\n",
"df = model.cv_results_\n", "df = model.cv_results_\n",
"df = pd.DataFrame(df)\n", "df = pd.DataFrame(df)\n",
"df['param_pca__n_components'] = 32\n", "df['param_pca__n_components'] = 32\n",
"# drop the per-split score columns plus the 'param_svc' and 'params' columns\n",
"cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n",
"df_latex = df.drop(columns=cols_to_remove)\n",
"# cols_to_remove\n",
"# turn param C and gamma to log2 scale\n",
"df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n",
"df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n",
"df_latex\n",
"# sort by rank\n",
"df_latex = df_latex.sort_values(by='rank_test_score')\n",
"# print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n",
"# df['param_pca__n_components'].unique()\n", "# df['param_pca__n_components'].unique()\n",
"# df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n", "# df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n",
"# df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n", "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n",
"# df" "df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# turn param_svc__C and param_svc__gamma to log2 scale\n",
"\n",
"# df.iloc[np.argmax(df['mean_test_score'])]\n",
"df.nlargest(10, 'mean_test_score')\n",
"\n",
"# add best c and gamma for sensor B"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import matplotlib as mpl\n",
"\n",
"# Pivot table for contour plot with log2 transformation\n",
"pivot = df.pivot(\n",
" index='param_svc__C', \n",
" columns='param_svc__gamma', \n",
" values=\"mean_test_score\"\n",
")\n",
"\n",
"# Create new log2-transformed indices and columns\n",
"log2_columns = np.log2(pivot.columns)\n",
"log2_indices = np.log2(pivot.index)\n",
"\n",
"\n",
"# Create a contour plot using log2-transformed data\n",
"plt.figure(figsize=(8, 6))\n",
"X, Y = np.meshgrid(log2_columns, log2_indices)\n",
"Z = pivot.values\n",
"\n",
"levels = np.linspace(0.6, Z.max(), 200) # Adjust the number of levels as needed\n",
"levels = [0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.978, 0.979, 0.9792]\n",
"# Create filled contours\n",
"# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n",
"# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n",
"\n",
"# Add contour lines\n",
"contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n",
"plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n",
"\n",
"# Set axis labels and title\n",
"plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n",
"plt.xlabel(\"log₂(gamma)\")\n",
"plt.ylabel(\"log₂(C)\")\n",
"\n",
"# Since we're already using log2 values, no need to transform tick labels\n",
"plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n",
"plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n",
"\n",
"print(plt.gca().get_yticklabels())\n",
"print(plt.gca().get_xticklabels())\n",
"\n",
"plt.show()"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Heatmap" "##### Heatmap"
] ]
}, },
{ {
@@ -2035,7 +2029,7 @@
"import seaborn as sns\n", "import seaborn as sns\n",
"norm = plt.Normalize(vmin=0.99, vmax=1)\n", "norm = plt.Normalize(vmin=0.99, vmax=1)\n",
"for i in df['param_pca__n_components'].unique():\n", "for i in df['param_pca__n_components'].unique():\n",
" subset = df[df['param_pca__n_components'] == i]\n", " subset: pd.DataFrame = df[df['param_pca__n_components'] == i]\n",
" pivot_table = subset.pivot(index='param_svc__gamma', columns='param_svc__C', values='mean_test_score')\n", " pivot_table = subset.pivot(index='param_svc__gamma', columns='param_svc__C', values='mean_test_score')\n",
" plt.figure(figsize=(8, 6), dpi=300)\n", " plt.figure(figsize=(8, 6), dpi=300)\n",
" sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm, cbar=False)\n", " sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm, cbar=False)\n",
@@ -2062,17 +2056,26 @@
" max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n", " max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n",
" max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n", " max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n",
" max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n", " max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n",
" plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=2))\n", " plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=3))\n",
" # mark the cell that had the max mean_test_score in the previous GridSearchCV (C=2**5, gamma=2**-5)\n",
" prev_max_idx = subset[(subset['param_svc__C'] == 2**5) & (subset['param_svc__gamma'] == 2**-5)].index\n",
" prev_max_C = subset.loc[prev_max_idx, 'param_svc__C']\n",
" prev_max_gamma = subset.loc[prev_max_idx, 'param_svc__gamma']\n",
" prev_max_x = np.where(pivot_table.columns == prev_max_C.values[0])[0][0] + 0.5\n",
" prev_max_y = np.where(pivot_table.index == prev_max_gamma.values[0])[0][0] + 0.5\n",
" # dashed rectangle\n",
" plt.gca().add_patch(plt.Rectangle((prev_max_x-0.5, prev_max_y-0.5), 1, 1, fill=False, edgecolor='blue', lw=3, linestyle='--'))\n",
" print(prev_max_C, prev_max_gamma)\n",
" plt.tick_params(axis='x', length=0) # Remove x-axis tick marks\n", " plt.tick_params(axis='x', length=0) # Remove x-axis tick marks\n",
" plt.tick_params(axis='y', length=0) # Remove y-axis tick marks\n", " plt.tick_params(axis='y', length=0) # Remove y-axis tick marks\n",
" plt.show()" " # plt.show()"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Evaluation on Fine Grid Search" "#### Evaluation Fine Grid Search on Dataset B"
] ]
}, },
{ {
@@ -2084,7 +2087,7 @@
"# Retrain the model on the entire dataset\n", "# Retrain the model on the entire dataset\n",
"final_model_finegrid = Pipeline([\n", "final_model_finegrid = Pipeline([\n",
" (\"scaler\", StandardScaler()),\n", " (\"scaler\", StandardScaler()),\n",
" (\"pca\", PCA(n_components=16)),\n", " (\"pca\", PCA(n_components=32)),\n",
" (\"svc\", SVC(C=2**8, gamma=2**-8, kernel='rbf'))\n", " (\"svc\", SVC(C=2**8, gamma=2**-8, kernel='rbf'))\n",
"])\n", "])\n",
"\n", "\n",
@@ -2109,6 +2112,15 @@
"y_pred_model2a = model2a.predict(X2b)" "y_pred_model2a = model2a.predict(X2b)"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model1a.get_params()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,