From 5041ee3feb44eb309097bf695ebca8f14fe8e52b Mon Sep 17 00:00:00 2001
From: nuluh <dam.ar@outlook.com>
Date: Tue, 24 Jun 2025 14:06:56 +0700
Subject: [PATCH] feat(src): add confusion matrix plotting and label percentage
 calculation

---
 code/src/ml/model_selection.py | 74 +++++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/code/src/ml/model_selection.py b/code/src/ml/model_selection.py
index 51c9f9b..afb80a4 100644
--- a/code/src/ml/model_selection.py
+++ b/code/src/ml/model_selection.py
@@ -155,7 +155,7 @@ def train_and_evaluate_model(
     except Exception as e:
         result["error"] = f"Training error: {str(e)}"
         return result
-def plot_confusion_matrix(results_sensor, x_test, y_test):
+def plot_confusion_matrix(results_sensor, y_test):
     """
     Plot confusion matrices for each model in results_sensor1.
 
@@ -186,14 +186,78 @@ def plot_confusion_matrix(results_sensor, x_test, y_test):
     # Iterate through each model result and plot confusion matrix
     for i in results_sensor:
         model = load(f"D:/thesis/models/{i['sensor']}/{i['model']}.joblib")
-        y_pred = model.predict(x_test)
-        cm = confusion_matrix(y_test, y_pred) # -> ndarray
+        cm = confusion_matrix(y_test, i['y_pred']) # -> ndarray
 
         # get the class labels
         labels = model.classes_
-
         # Plot
         disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
         disp.plot(cmap=plt.cm.Blues)  # You can change colormap
         plt.title(f"{i['model']} {i['sensor']} Test")
-        plt.show()
\ No newline at end of file
+        plt.show()
+
+def calculate_label_percentages(labels):
+    """
+    Print the percentage distribution of the unique labels in an array.
+
+    Parameters:
+        labels (array-like): Labels to summarise; may have any shape.
+
+    Returns:
+        None: one "Label <value>: <pct>%" line per unique label is printed.
+    """
+    # np.unique flattens its input and returns the unique labels sorted
+    unique, counts = np.unique(labels, return_counts=True)
+
+    # Divide by the total element count rather than len(labels): len() only
+    # sees the first axis and would inflate percentages for N-d input
+    percentages = (counts / counts.sum()) * 100
+
+    result = "\n".join(f"Label {label}: {percentage:.2f}%" for label, percentage in zip(unique, percentages))
+    print(result)
+
+def inference_model(
+    models, raw_file, column_question: int = None
+):
+    """
+    Predict labels for one column of a raw vibration file and print their share.
+
+    The selected column is transformed with an STFT (1024-sample Hann window,
+    512-sample overlap, fs=1024); the magnitude spectrum of every time frame
+    is one sample passed to the model.
+
+    Parameters
+    ----------
+    models : str or path-like
+        Path of the exported model file, opened with ``load``.
+    raw_file : str or path-like
+        Whitespace-delimited text file; its first 10 lines are skipped and the
+        next line is used as the header.
+    column_question : int
+        Positional index of the column to analyse. Required despite the
+        ``None`` default; a ``ValueError`` is raised when it is omitted.
+
+    Returns
+    -------
+    None
+        The label percentages are printed by ``calculate_label_percentages``.
+
+    Example
+    -------
+    >>> model = {"SVM": "models/sensor1/SVM.joblib", "SVM with PCA": "models/sensor1/SVM_with_PCA.joblib"}
+    >>> inference_model(model["SVM"], "zzzAD1.TXT", column_question=1)
+    """
+    if column_question is None:
+        raise ValueError("column_question must be an integer column index")
+    # sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True
+    df = pd.read_csv(raw_file, sep=r"\s+", skiprows=10, header=0, memory_map=True)
+    vibration_data = df.iloc[:, column_question].values
+    # hann must come from scipy.signal.windows; the scipy.signal alias is gone
+    from scipy.signal import stft
+    from scipy.signal.windows import hann
+    freq, _, Zxx = stft(vibration_data, fs=1024, window=hann(1024), nperseg=1024, noverlap=512)
+    # Zxx is (n_freq, n_frames): after .T there is one column per entry of freq
+    data = pd.DataFrame(np.abs(Zxx).T, columns=[f"Freq_{f:.2f}" for f in freq])
+    data = data.rename(columns={"Freq_0.00": "00"}) # To match the model input format
+    model = load(models)  # Load the model from the provided path
+    return calculate_label_percentages(model.predict(data))
\ No newline at end of file