diff --git a/code/notebooks/stft.ipynb b/code/notebooks/stft.ipynb index 3e29c43..ccb1db6 100644 --- a/code/notebooks/stft.ipynb +++ b/code/notebooks/stft.ipynb @@ -717,25 +717,25 @@ "import numpy as np\n", "\n", "# Original data - Cumulative explained variance for first dataset\n", - "cumulative_variance = np.cumsum(pca.explained_variance_ratio_)\n", - "n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1\n", + "cumulative_variance_1 = np.cumsum(pca1.explained_variance_ratio_)\n", + "n_components_95 = np.argmax(cumulative_variance_1 >= 0.95) + 1\n", "n_components_16 = 16\n", - "cumulative_variance_16 = cumulative_variance[n_components_16 - 1]\n", + "cumulative_variance_1_16 = cumulative_variance_1[n_components_16 - 1]\n", "\n", "# Create figure and primary axis\n", "fig, ax1 = plt.subplots(figsize=(8, 6))\n", "\n", "# Plot first dataset on primary axis\n", - "ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', \n", - " label='Dataset 1', color='blue')\n", + "ax1.plot(range(1, len(cumulative_variance_1) + 1), cumulative_variance_1, linestyle='-', \n", + " label='Sensor A', color='blue')\n", "ax1.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')\n", "ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'n={n_components_95}')\n", - "ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_16:.2f})')\n", - "ax1.axhline(y=cumulative_variance_16, color='b', linestyle='--')\n", + "ax1.axvline(x=n_components_16, color='b', linestyle='--', label=f'n={n_components_16} ({cumulative_variance_1_16:.2f})')\n", + "ax1.axhline(y=cumulative_variance_1_16, color='b', linestyle='--')\n", "\n", "# Set labels and properties for first dataset\n", "ax1.set_xlabel('Principal Component')\n", - "ax1.set_ylabel('Dataset 1 Variance Ratio', color='blue')\n", + "ax1.set_ylabel('Sensor A Variance Ratio', color='blue')\n", "ax1.tick_params(axis='y', 
labelcolor='blue')\n", "ax1.grid(True, alpha=0.3)\n", "\n", @@ -744,14 +744,17 @@ "\n", "# Example second dataset (replace with your actual data)\n", "# For demonstration, I'm creating synthetic data with a different scale\n", - "second_dataset = np.sqrt(cumulative_variance) # Just an example - replace with your actual data\n", "\n", + "cumulative_variance_2 = np.cumsum(pca2.explained_variance_ratio_)\n", + "n_components_95_2 = np.argmax(cumulative_variance_2 >= 0.95) + 1\n", + "n_components_16_2 = 16\n", + "cumulative_variance_2_16 = cumulative_variance_2[n_components_16_2 - 1]\n", "# Plot second dataset on secondary axis\n", - "ax2.plot(range(1, len(second_dataset) + 1), second_dataset, marker='s', linestyle='-', \n", - " color='red', label='Dataset 2')\n", + "ax2.plot(range(1, len(cumulative_variance_2) + 1), cumulative_variance_2, marker='s', linestyle='-', \n", + " color='red', label='Sensor B', alpha=0.1)\n", "\n", "# Set properties for second dataset\n", - "ax2.set_ylabel('Dataset 2 Variance Ratio', color='red')\n", + "ax2.set_ylabel('Sensor B Variance Ratio', color='red')\n", "ax2.tick_params(axis='y', labelcolor='red')\n", "\n", "# Create combined legend\n", @@ -1310,7 +1313,17 @@ "model: GridSearchCV = load('D:/thesis/models/Sensor B/Grid+SVM+StandardScaler+PCA.joblib')\n", "df = model.cv_results_\n", "df = pd.DataFrame(df)\n", - "df\n", + "# remove all data on param columns and all split score test data\n", + "cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n", + "df_latex = df.drop(columns=cols_to_remove)\n", + "# cols_to_remove\n", + "# turn param C and gamma to log2 scale\n", + "df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n", + "df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n", + "df_latex\n", + "# sort by rank\n", + "df_latex = df_latex.sort_values(by='rank_test_score')\n", + "print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n", "# change 
column \"param_svc__C\" to np.log2\n", "# df['param_svc__C'] = np.log2(df['param_svc__C'])\n", "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma'])\n", @@ -1324,7 +1337,32 @@ "# Get rows where param_pca__n_components is 32\n", "# result = df[df['param_pca__n_components'] == 32]\n", "# top 10 most fit time\n", - "result.nlargest(10, 'mean_test_score')" + "# result.nlargest(10, 'mean_test_score')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# `model` is the fitted GridSearchCV object loaded in the previous cell\n", + "cv_results = model.cv_results_\n", + "\n", + "# Per-candidate fit/score times, averaged over the CV folds (sklearn reports them in seconds)\n", + "mean_fit_time = cv_results['mean_fit_time']\n", + "mean_score_time = cv_results['mean_score_time']\n", + "\n", + "# Number of cross-validation splits actually used by the fitted search\n", + "n_splits = model.n_splits_\n", + "\n", + "# print(n_splits)\n", + "# Sum over all candidates and folds, then convert seconds to minutes (excludes the final refit)\n", + "total_time_elapsed = np.sum((mean_fit_time + mean_score_time) * n_splits) / 60\n", + "\n", + "print(f\"Total time elapsed for GridSearchCV: {total_time_elapsed:.2f} minutes\")" ] }, { @@ -1711,6 +1749,13 @@ "##### Evaluation Baseline on Dataset B" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###### Train test split" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1754,6 +1799,13 @@ "patch_sklearn()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###### Training" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1843,15 +1895,7 @@ " \"svc__C\": np.logspace(3, 7, 9, base=2),\n", " \"svc__gamma\": np.logspace(-7, -3, 9, base=2)\n", " },\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "]\n", "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" ] }, @@ -1862,7 +1906,7 @@ "outputs": [], "source": [ "import sklearnex\n", - "sklearnex.patch_sklearn()\n" + 
"sklearnex.patch_sklearn()" ] }, { @@ -1930,6 +1974,13 @@ "results_sensor2" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Results DataFrame" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1938,89 +1989,32 @@ "source": [ "from joblib import load\n", "\n", - "model: GridSearchCV = load('D:/thesis/models/Sensor A/FineGrid+SVM+StandardScaler+PCA.joblib')\n", + "model: GridSearchCV = load('D:/thesis/models/Sensor B/FineGrid+SVM+StandardScaler+PCA.joblib')\n", "df = model.cv_results_\n", "df = pd.DataFrame(df)\n", "df['param_pca__n_components'] = 32\n", + "# remove all data on param columns and all split score test data\n", + "cols_to_remove = [col for col in df.columns if 'split' in col ] + ['param_svc', 'params']\n", + "df_latex = df.drop(columns=cols_to_remove)\n", + "# cols_to_remove\n", + "# turn param C and gamma to log2 scale\n", + "df_latex['param_svc__C'] = np.log2(df_latex['param_svc__C'])\n", + "df_latex['param_svc__gamma'] = np.log2(df_latex['param_svc__gamma'])\n", + "df_latex\n", + "# sort by rank\n", + "df_latex = df_latex.sort_values(by='rank_test_score')\n", + "# print(df_latex.to_latex(float_format=\"%.2f\", index=False, header=False))\n", "# df['param_pca__n_components'].unique()\n", "# df['param_svc__C'] = np.log2(df['param_svc__C']).astype(int)\n", "# df['param_svc__gamma'] = np.log2(df['param_svc__gamma']).astype(int)\n", - "# df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "# turn param_svc__C and param_svc__gamma to log2 scale\n", - "\n", - "# df.iloc[np.argmax(df['mean_test_score'])]\n", - "df.nlargest(10, 'mean_test_score')\n", - "\n", - "# add best c and gamma for sensor B" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import matplotlib as mpl\n", - "\n", - "# Pivot 
table for contour plot with log2 transformation\n", - "pivot = df.pivot(\n", - " index='param_svc__C', \n", - " columns='param_svc__gamma', \n", - " values=\"mean_test_score\"\n", - ")\n", - "\n", - "# Create new log2-transformed indices and columns\n", - "log2_columns = np.log2(pivot.columns)\n", - "log2_indices = np.log2(pivot.index)\n", - "\n", - "\n", - "# Create a contour plot using log2-transformed data\n", - "plt.figure(figsize=(8, 6))\n", - "X, Y = np.meshgrid(log2_columns, log2_indices)\n", - "Z = pivot.values\n", - "\n", - "levels = np.linspace(0.6, Z.max(), 200) # Adjust the number of levels as needed\n", - "levels = [0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.978, 0.979, 0.9792]\n", - "# Create filled contours\n", - "# contour = plt.contourf(X, Y, Z, levels=levels, cmap=\"viridis\")\n", - "# plt.colorbar(contour, label=\"Mean Test Score (Accuracy)\")\n", - "\n", - "# Add contour lines\n", - "contour_lines = plt.contour(X, Y, Z, levels=levels, cmap='Dark2', linewidths=0.5)\n", - "plt.clabel(contour_lines, inline=True, fontsize=8, fmt=\"%.4f\")\n", - "\n", - "# Set axis labels and title\n", - "plt.title(\"GridSearchCV Mean Test Score (Accuracy)\")\n", - "plt.xlabel(\"log₂(gamma)\")\n", - "plt.ylabel(\"log₂(C)\")\n", - "\n", - "# Since we're already using log2 values, no need to transform tick labels\n", - "plt.xticks(X[0], [f\"{x:.1f}\" for x in X[0]])\n", - "plt.yticks(Y[:,0], [f\"{y:.1f}\" for y in Y[:,0]])\n", - "\n", - "print(plt.gca().get_yticklabels())\n", - "print(plt.gca().get_xticklabels())\n", - "\n", - "plt.show()" + "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Heatmap" + "##### Heatmap" ] }, { @@ -2035,7 +2029,7 @@ "import seaborn as sns\n", "norm = plt.Normalize(vmin=0.99, vmax=1)\n", "for i in df['param_pca__n_components'].unique():\n", - " subset = df[df['param_pca__n_components'] == i]\n", + " subset: pd.DataFrame = df[df['param_pca__n_components'] == i]\n", " pivot_table = subset.pivot(index='param_svc__gamma', 
columns='param_svc__C', values='mean_test_score')\n", " plt.figure(figsize=(8, 6), dpi=300)\n", " sns.heatmap(pivot_table, annot=True, fmt=\".4f\", cmap='RdYlGn', norm=norm, cbar=False)\n", @@ -2062,17 +2056,26 @@ " max_gamma = subset.loc[max_idx, 'param_svc__gamma']\n", " max_x = np.where(pivot_table.columns == max_C)[0][0] + 0.5\n", " max_y = np.where(pivot_table.index == max_gamma)[0][0] + 0.5\n", - " plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=2))\n", + " plt.gca().add_patch(plt.Rectangle((max_x-0.5, max_y-0.5), 1, 1, fill=False, edgecolor='red', lw=3))\n", + " # give mark for previous max mean_test_score from previous gridsearchcv at gamma=-5 and C=5\n", + " prev_max_idx = subset[(subset['param_svc__C'] == 2**5) & (subset['param_svc__gamma'] == 2**-5)].index\n", + " prev_max_C = subset.loc[prev_max_idx, 'param_svc__C']\n", + " prev_max_gamma = subset.loc[prev_max_idx, 'param_svc__gamma']\n", + " prev_max_x = np.where(pivot_table.columns == prev_max_C.values[0])[0][0] + 0.5\n", + " prev_max_y = np.where(pivot_table.index == prev_max_gamma.values[0])[0][0] + 0.5\n", + " # dashed rectangle\n", + " plt.gca().add_patch(plt.Rectangle((prev_max_x-0.5, prev_max_y-0.5), 1, 1, fill=False, edgecolor='blue', lw=3, linestyle='--'))\n", + " print(prev_max_C, prev_max_gamma)\n", " plt.tick_params(axis='x', length=0) # Remove x-axis tick marks\n", " plt.tick_params(axis='y', length=0) # Remove y-axis tick marks\n", - " plt.show()" + " # plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Evaluation on Fine Grid Search" + "#### Evaluation Fine Grid Search on Dataset B" ] }, { @@ -2084,7 +2087,7 @@ "# Retrain the model on the entire dataset\n", "final_model_finegrid = Pipeline([\n", " (\"scaler\", StandardScaler()),\n", - " (\"pca\", PCA(n_components=16)),\n", + " (\"pca\", PCA(n_components=32)),\n", " (\"svc\", SVC(C=2**8, gamma=2**-8, kernel='rbf'))\n", "])\n", "\n", @@ -2109,6 +2112,15 @@ 
"y_pred_model2a = model2a.predict(X2b)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model1a.get_params()" + ] + }, { "cell_type": "code", "execution_count": null,