feat(notebooks): refactor and add rectangle of prev grid search
This commit is contained in:
@@ -717,25 +717,25 @@
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# Original data - Cumulative explained variance for first dataset\n",
|
||||
"cumulative_variance = np.cumsum(pca.explained_variance_ratio_)\n",
|
||||
"n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1\n",
|
||||
"cumulative_variance_1 = np.cumsum(pca1.explained_variance_ratio_)\n",
|
||||
"n_components_95 = np.argmax(cumulative_variance_1 >= 0.95) + 1\n",
|
||||
"n_components_16 = 16\n",
|
||||
"cumulative_variance_16 = cumulative_variance[n_components_16 - 1]\n",
|
||||
"cumulative_variance_1_16 = cumulative_variance_1[n_components_16 - 1]\n",
|
||||
"\n",
|
||||
"# Create figure and primary axis\n",
|
||||
"fig, ax1 = plt.subplots(figsize=(8, 6))\n",
|
||||
"\n",
|
||||
"# Plot first dataset on primary axis\n",
|
||||
"ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', \n",
|
||||
" label='Dataset 1', color='blue')\n",
|
||||
"ax1.plot(range(1, len(cumulative_variance_1) + 1), cumulative_variance_1, linestyle='-', \n",
|
||||
" label='Sensor A', color='blue')\n",
|
||||
"ax1.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')\n",
|
||||
"ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'n={n_components_95}')\n",
|
||||
"ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_16:.2f})')\n",
|
||||
"ax1.axhline(y=cumulative_variance_16, color='b', linestyle='--')\n",
|
||||
"ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_1_16:.2f})')\n",
|
||||
"ax1.axhline(y=cumulative_variance_1_16, color='b', linestyle='--')\n",
|
||||
"\n",
|
||||
"# Set labels and properties for first dataset\n",
|
||||
"ax1.set_xlabel('Principal Component')\n",
|
||||
"ax1.set_ylabel('Dataset 1 Variance Ratio', color='blue')\n",
|
||||
"ax1.set_ylabel('Sensor A Variance Ratio', color='blue')\n",
|
||||
"ax1.tick_params(axis='y', labelcolor='blue')\n",
|
||||
"ax1.grid(True, alpha=0.3)\n",
|
||||
"\n",
|
||||
@@ -744,14 +744,17 @@
|
||||
"\n",
|
||||
"# Example second dataset (replace with your actual data)\n",
|
||||
"# For demonstration, I'm creating synthetic data with a different scale\n",
|
||||
"second_dataset = np.sqrt(cumulative_variance) # Just an example - replace with your actual data\n",
|
||||
"\n",
|
||||
"cumulative_variance_2 = np.cumsum(pca2.explained_variance_ratio_)\n",
|
||||
"n_components_95_2 = np.argmax(cumulative_variance_2 >= 0.95) + 1\n",
|
||||
"n_components_16_2 = 16\n",
|
||||
"cumulative_variance_2_16 = cumulative_variance_2[n_components_16_2 - 1]\n",
|
||||
"# Plot second dataset on secondary axis\n",
|
||||
"ax2.plot(range(1, len(second_dataset) + 1), second_dataset, marker='s', linestyle='-', \n",
|
||||
" color='red', label='Dataset 2')\n",
|
||||
"ax2.plot(range(1, len(cumulative_variance_2) + 1), cumulative_variance_2, marker='s', linestyle='-', \n",
|
||||
" color='red', label='Sensor B', alpha=0.1)\n",
|
||||
"\n",
|
||||
"# Set properties for second dataset\n",
|
||||
"ax2.set_ylabel('Dataset 2 Variance Ratio', color='red')\n",
|
||||
"ax2.set_ylabel('Sensor B Variance Ratio', color='red')\n",
|
||||
"ax2.tick_params(axis='y', labelcolor='red')\n",
|
||||
"\n",
|
||||
"# Create combined legend\n",
|
||||
@@ -1310,7 +1313,17 @@
|
||||
"model: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n",
|
||||
"df = model.cv_results_\n",
|
||||
"df = pd.DataFrame(df)\n",
|
||||
"df\n",
|
||||
"# remove all data on param columns and all split score test data\n",
|
||||
"cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n",
|
||||
"df_latex = df.drop(columns=cols_to_remove)\n",
|
||||
"# cols_to_remove\n",
|
||||
"# turn param C and gamma to log2 scale\n",
|
||||
"df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n",
|
||||
"df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n",
|
||||
"df_latex\n",
|
||||
"# sort by rank\n",
|
||||
"df_latex = df_latex.sort_values(by='rank_test_score')\n",
|
||||
"print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n",
|
||||
"# change column \"param_svc__C\" to np.log2\n",
|
||||
"# df['param_svc__C'] = np.log2(df['param_svc__C'])\n",
|
||||
"# df['param_svc__gamma'] = np.log2(df['param_svc__gamma'])\n",
|
||||
@@ -1324,7 +1337,32 @@
|
||||
"# Get rows where param_pca__n_components is 32\n",
|
||||
"# result = df[df['param_pca__n_components'] == 32]\n",
|
||||
"# top 10 candidates by mean test score\n",
|
||||
"result.nlargest(10, 'mean_test_score')"
|
||||
"# result.nlargest(10, 'mean_test_score')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# `model` is the fitted GridSearchCV object loaded in a previous cell\n",
|
||||
"cv_results = model.cv_results_\n",
|
||||
"\n",
|
||||
"# Per-candidate mean fit / score time over the CV splits (scikit-learn reports seconds)\n",
|
||||
"mean_fit_time = cv_results['mean_fit_time']\n",
|
||||
"mean_score_time = cv_results['mean_score_time']\n",
|
||||
"\n",
|
||||
"# Number of cross-validation splits actually used by the search\n",
|
||||
"# (model.cv is the splitter object, so read the fitted attribute instead)\n",
|
||||
"n_splits = model.n_splits_\n",
|
||||
"\n",
|
||||
"# Total fit + score time summed over every candidate and split, in seconds.\n",
|
||||
"# NOTE: this is summed compute time; it only equals wall-clock time when the search ran with n_jobs=1.\n",
|
||||
"total_time_elapsed = np.sum((mean_fit_time + mean_score_time) * n_splits)\n",
|
||||
"\n",
|
||||
"# Convert seconds to minutes so the printed unit matches the value\n",
|
||||
"print(f\"Total time elapsed for GridSearchCV: {total_time_elapsed / 60:.2f} minutes\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1711,6 +1749,13 @@
|
||||
"##### Evaluation Baseline on Dataset B"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"###### Train test split"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -1754,6 +1799,13 @@
|
||||
"patch_sklearn()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"###### Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -1843,15 +1895,7 @@
|
||||
" \"svc__C\": np.logspace(3, 7, 9, base=2),\n",
|
||||
" \"svc__gamma\": np.logspace(-7, -3, 9, base=2)\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"]\n",
|
||||
"cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)"
|
||||
]
|
||||
},
|
||||
@@ -1862,7 +1906,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sklearnex\n",
|
||||
"sklearnex.patch_sklearn()\n"
|
||||
"sklearnex.patch_sklearn()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1930,6 +1974,13 @@
|
||||
"results_sensor2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Results DataFrame"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -1938,89 +1989,32 @@
|
||||
"source": [
|
||||
"from joblib import load\n",
|
||||
"\n",
|
||||
"model: GridSearchCV = load('D:/thesis/models/Sensor A/FineGrid+SVM+StandardScaler+PCA.joblib')\n",
|
||||
"model: GridSearchCV = load('D:/thesis/models/Sensor B/FineGrid+SVM+StandardScaler+PCA.joblib')\n",
|
||||
"df = model.cv_results_\n",
|
||||
"df = pd.DataFrame(df)\n",
|
||||
"df['param_pca__n_components'] = 32\n",
|
||||
"# remove all data on param columns and all split score test data\n",
|
||||
"cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n",
|
||||
"df_latex = df.drop(columns=cols_to_remove)\n",
|
||||
"# cols_to_remove\n",
|
||||
"# turn param C and gamma to log2 scale\n",
|
||||
"df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n",
|
||||
"df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n",
|
||||
"df_latex\n",
|
||||
"# sort by rank\n",
|
||||
"df_latex = df_latex.sort_values(by='rank_test_score')\n",
|
||||
"# print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n",
|
||||
"# df['param_pca__n_components'].unique()\n",
|
||||
"# df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n",
|
||||
"# df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n",
|
||||
"# df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# turn param_svc__C and param_svc__gamma to log2 scale\n",
|
||||
"\n",
|
||||
"# df.iloc[np.argmax(df['mean_test_score'])]\n",
|
||||
"df.nlargest(10, 'mean_test_score')\n",
|
||||
"\n",
|
||||
"# add best c and gamma for sensor B"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib as mpl\n",
|
||||
"\n",
|
||||
"# Pivot table for contour plot with log2 transformation\n",
|
||||
"pivot = df.pivot(\n",
|
||||
" index='param_svc__C', \n",
|
||||
" columns='param_svc__gamma', \n",
|
||||
" values=\"mean_test_score\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Create new log2-transformed indices and columns\n",
|
||||
"log2_columns = np.log2(pivot.columns)\n",
|
||||
"log2_indices = np.log2(pivot.index)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Create a contour plot using log2-transformed data\n",
|
||||
"plt.figure(figsize=(8, 6))\n",
|
||||
"X, Y = np.meshgrid(log2_columns, log2_indices)\n",
|
||||
"Z = pivot.values\n",
|
||||
"\n",
|
||||
"levels = np.linspace(0.6, Z.max(), 200) # Adjust the number of levels as needed\n",
|
||||
"levels = [0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.978, 0.979, 0.9792]\n",
|
||||
"# Create filled contours\n",
|
||||
"# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n",
|
||||
"# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n",
|
||||
"\n",
|
||||
"# Add contour lines\n",
|
||||
"contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n",
|
||||
"plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n",
|
||||
"\n",
|
||||
"# Set axis labels and title\n",
|
||||
"plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n",
|
||||
"plt.xlabel(\"log₂(gamma)\")\n",
|
||||
"plt.ylabel(\"log₂(C)\")\n",
|
||||
"\n",
|
||||
"# Since we're already using log2 values, no need to transform tick labels\n",
|
||||
"plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n",
|
||||
"plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n",
|
||||
"\n",
|
||||
"print(plt.gca().get_yticklabels())\n",
|
||||
"print(plt.gca().get_xticklabels())\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Heatmap"
|
||||
"##### Heatmap"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2035,7 +2029,7 @@
|
||||
"import seaborn as sns\n",
|
||||
"norm = plt.Normalize(vmin=0.99, vmax=1)\n",
|
||||
"for i in df['param_pca__n_components'].unique():\n",
|
||||
" subset = df[df['param_pca__n_components'] == i]\n",
|
||||
" subset: pd.DataFrame = df[df['param_pca__n_components'] == i]\n",
|
||||
" pivot_table = subset.pivot(index='param_svc__gamma', columns='param_svc__C', values='mean_test_score')\n",
|
||||
" plt.figure(figsize=(8, 6), dpi=300)\n",
|
||||
" sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm, cbar=False)\n",
|
||||
@@ -2062,17 +2056,26 @@
|
||||
" max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n",
|
||||
" max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n",
|
||||
" max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n",
|
||||
" plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=2))\n",
|
||||
" plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=3))\n",
|
||||
"    # Mark the best cell of the previous grid search (C = 2**5, gamma = 2**-5) with a dashed rectangle.\n",
|
||||
"    # Grid values are floats, so match with np.isclose instead of ==, and skip the marker when the\n",
|
||||
"    # point is not on this grid instead of failing on an empty lookup.\n",
|
||||
"    prev_max_C, prev_max_gamma = 2**5, 2**-5\n",
|
||||
"    prev_cols = np.flatnonzero(np.isclose(pivot_table.columns.astype(float), prev_max_C))\n",
|
||||
"    prev_rows = np.flatnonzero(np.isclose(pivot_table.index.astype(float), prev_max_gamma))\n",
|
||||
"    if prev_cols.size and prev_rows.size:\n",
|
||||
"        # centre of the matching heatmap cell (columns are C, rows are gamma)\n",
|
||||
"        prev_max_x = prev_cols[0] + 0.5\n",
|
||||
"        prev_max_y = prev_rows[0] + 0.5\n",
|
||||
"        # dashed rectangle\n",
|
||||
"        plt.gca().add_patch(plt.Rectangle((prev_max_x-0.5, prev_max_y-0.5), 1, 1, fill=False, edgecolor='blue', lw=3, linestyle='--'))\n",
|
||||
"    else:\n",
|
||||
"        print(f'Previous best (C={prev_max_C}, gamma={prev_max_gamma}) is not on this grid; marker skipped')\n",
|
||||
" plt.tick_params(axis='x', length=0) # Remove x-axis tick marks\n",
|
||||
" plt.tick_params(axis='y', length=0) # Remove y-axis tick marks\n",
|
||||
" plt.show()"
|
||||
" # plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Evaluation on Fine Grid Search"
|
||||
"#### Evaluation Fine Grid Search on Dataset B"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2084,7 +2087,7 @@
|
||||
"# Retrain the model on the entire dataset\n",
|
||||
"final_model_finegrid = Pipeline([\n",
|
||||
" (\"scaler\", StandardScaler()),\n",
|
||||
" (\"pca\", PCA(n_components=16)),\n",
|
||||
" (\"pca\", PCA(n_components=32)),\n",
|
||||
" (\"svc\", SVC(C=2**8, gamma=2**-8, kernel='rbf'))\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
@@ -2109,6 +2112,15 @@
|
||||
"y_pred_model2a = model2a.predict(X2b)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model1a.get_params()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
Reference in New Issue
Block a user