feat(src): add confusion matrix plotting and label percentage calculation

2025-06-24 14:06:56 +07:00
parent 114ab849b9
commit 5041ee3feb
1 changed files with 69 additions and 5 deletions
--- a/code/src/ml/model_selection.py
+++ b/code/src/ml/model_selection.py
@@ -155,7 +155,7 @@ def train_and_evaluate_model(
    except Exception as e:
        result["error"] = f"Training error: {str(e)}"
        return result
-def plot_confusion_matrix(results_sensor, x_test, y_test):
+def plot_confusion_matrix(results_sensor, y_test):
    """
    Plot confusion matrices for each model in results_sensor1.
@@ -186,14 +186,78 @@ def plot_confusion_matrix(results_sensor, x_test, y_test):
    # Iterate through each model result and plot confusion matrix
    for i in results_sensor:
        model = load(f"D:/thesis/models/{i['sensor']}/{i['model']}.joblib")
-        y_pred = model.predict(x_test)
+        cm = confusion_matrix(y_test, i['y_pred']) # -> ndarray
        cm = confusion_matrix(y_test, y_pred) # -> ndarray
        # get the class labels
        labels = model.classes_
        # Plot
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap=plt.cm.Blues)  # You can change colormap
        plt.title(f"{i['model']} {i['sensor']} Test")
-        plt.show()
+        plt.show()
 def calculate_label_percentages(labels):
     """
     Print the percentage share of each unique label in ``labels``.

     Parameters:
         labels (array-like): Labels to summarise. Input of any shape is
             accepted; ``np.unique`` counts over the flattened values.

     Returns:
         None. One ``Label <value>: <pct>%`` line per unique label is
         printed, in the sorted order produced by ``np.unique``.
     """
     # Count occurrences of each unique label
     unique, counts = np.unique(labels, return_counts=True)
     # Divide by the number of counted elements rather than len(labels):
     # np.unique flattens its input, so for a 2-D array len() would be the
     # row count and the shares would add up to more than 100 %.
     percentages = (counts / counts.sum()) * 100
     # Build and print the result string
     result = "\n".join(
         f"Label {label}: {percentage:.2f}%"
         for label, percentage in zip(unique, percentages)
     )
     print(result)
 def inference_model(models, raw_file, column_question: int = None):
     """
     Run a saved model on one column of a raw vibration file and print the
     predicted label distribution.

     The selected column is transformed with an STFT (fs=1024, 1024-sample
     Hann window, 512-sample overlap). Every time segment becomes one row of
     spectral magnitudes, and those rows are passed to the model's
     ``predict``.

     Parameters
     ----------
     models : str or path-like
         Path to the exported model file, passed to ``load``.
     raw_file : str or path-like
         Whitespace-delimited text file. The first 10 lines are skipped and
         the following line is used as the header.
     column_question : int
         Zero-based positional index of the column to analyse. Required;
         the ``None`` default is kept only for signature compatibility.

     Returns
     -------
     None
         The per-label percentages of the predictions are printed by
         ``calculate_label_percentages``.

     Raises
     ------
     ValueError
         If ``column_question`` is not given.

     Example
     -------
     >>> model = {"SVM": "models/sensor1/SVM.joblib", "SVM with PCA": "models/sensor1/SVM_with_PCA.joblib"}
     >>> inference_model(model["SVM"], "zzzAD1.TXT", column_question=1)
     """
     if column_question is None:
         raise ValueError(
             "column_question must be the positional index of the column to analyse"
         )

     # sep=r"\s+" is the supported spelling of the deprecated
     # delim_whitespace=True and parses the file the same way.
     df = pd.read_csv(raw_file, sep=r"\s+", skiprows=10, header=0, memory_map=True)
     vibration_data = df.iloc[:, column_question].values

     # Perform STFT. The window comes from scipy.signal.windows because the
     # scipy.signal.hann alias no longer exists in current SciPy releases;
     # hann(1024) keeps the same (symmetric) window as before.
     from scipy.signal import stft
     from scipy.signal.windows import hann

     freq, _times, Zxx = stft(
         vibration_data,
         fs=1024,
         window=hann(1024),
         nperseg=1024,
         noverlap=512,
     )

     # Zxx is (frequency bins, time segments). After transposing, each column
     # is one frequency bin, so the labels must come from the bin frequencies
     # returned by stft, not from the number of time segments.
     data = pd.DataFrame(np.abs(Zxx).T, columns=[f"Freq_{f:.2f}" for f in freq])
     data = data.rename(columns={"Freq_0.00": "00"})  # To match the model input format

     # NOTE(review): load() is presumably joblib.load, which unpickles the
     # file -- only pass model paths from a trusted source.
     model = load(models)  # Load the model from the provided path
     return calculate_label_percentages(model.predict(data))