Files
thesis/code/src/ml/model_selection.py

271 lines
9.1 KiB
Python

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from joblib import load
def create_ready_data(
    stft_data_path: str,
    stratify: np.ndarray = None,
) -> tuple[pd.DataFrame, np.ndarray]:
    """
    Load every STFT CSV file in a directory and build a labelled dataset.

    Each regular file in ``stft_data_path`` is read with ``pandas.read_csv``
    (its first line is skipped), all resulting frames are stacked row-wise,
    and every row is labelled with the zero-based position of its source file
    in the alphabetically sorted directory listing.

    Parameters:
    -----------
    stft_data_path : str
        Path to the directory containing STFT data files
        (e.g. 'data/converted/raw/sensor1')
    stratify : np.ndarray, optional
        Currently unused; accepted only so existing callers keep working.

    Returns:
    --------
    tuple
        (pd.DataFrame, np.ndarray) - Combined data and corresponding labels.
        Both are empty when the directory contains no files.
    """
    # os.listdir() returns entries in arbitrary order. Because the label of a
    # file is its position in this list, sort it so the label <-> file mapping
    # is reproducible across runs and machines. Sub-directories are skipped
    # because read_csv cannot open them.
    file_names = sorted(
        name
        for name in os.listdir(stft_data_path)
        if os.path.isfile(os.path.join(stft_data_path, name))
    )
    frames = [
        pd.read_csv(os.path.join(stft_data_path, name), skiprows=1)
        for name in file_names
    ]

    if not frames:
        print("No data available in ready_data list")
        print("No labels available in y_data list")
        return pd.DataFrame(), np.array([])

    X = pd.concat(frames, axis=0, ignore_index=True)
    print(f"Type of combined data: {type(X)}")
    print(f"Shape of combined data: {X.shape}")

    # TODO: Should be replaced with actual desired labels
    # One label per row: the index of the file the row came from.
    y = np.concatenate(
        [np.full(len(frame), label) for label, frame in enumerate(frames)]
    )
    return X, y
def train_and_evaluate_model(
    model, model_name, sensor_label, x_train, y_train, x_test, y_test, export=None
):
    """
    Train a machine learning model, evaluate its performance, and optionally export it.

    The model is fitted on the training data, used to predict the test data,
    scored with scikit-learn's ``accuracy_score`` and, when ``export`` is
    given, dumped to ``<export>/<model_name>.joblib``. Failures in fitting,
    predicting or scoring are not raised; they are reported through the
    ``'error'`` key of the returned dictionary.

    Parameters
    ----------
    model : estimator object
        The machine learning model to train. Must provide ``fit`` and ``predict``.
    model_name : str
        Name of the model, used for the export filename and in the returned results.
    sensor_label : str
        Label identifying which sensor's data the model is being trained on.
    x_train : array-like or pandas.DataFrame
        The training input samples.
    y_train : array-like
        The target values for training.
    x_test : array-like or pandas.DataFrame
        The test input samples.
    y_test : array-like
        The target values for testing.
    export : str, optional
        Directory path where the trained model should be saved. If None
        (or empty), the model is not saved.

    Returns
    -------
    dict
        Always contains:
        - 'model': model_name
        - 'sensor': sensor_label
        - 'success': True only if training, prediction and scoring all worked
        Added as the corresponding step completes:
        - 'elapsed_time_training': seconds spent in ``fit``
        - 'elapsed_time_validation': seconds spent in ``predict``
        - 'y_pred': predictions for ``x_test`` exactly as returned by ``predict``
        - 'accuracy': accuracy percentage (0-100)
        Added on failure:
        - 'error': message describing the step that failed
        - 'export_error': message if saving the model failed (does not
          affect 'success')

    Example
    -------
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>> X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)
    >>> result = train_and_evaluate_model(
    ...     SVC(),
    ...     "SVM",
    ...     "sensor1",
    ...     X_train,
    ...     y_train,
    ...     X_test,
    ...     y_test,
    ...     export="models/sensor1"
    ... )
    >>> print(f"Model accuracy: {result['accuracy']:.2f}%")
    """
    import time

    result = {"model": model_name, "sensor": sensor_label, "success": False}

    # Train the model. perf_counter() is monotonic, so the measured duration
    # cannot be distorted by system clock adjustments.
    started = time.perf_counter()
    try:
        model.fit(x_train, y_train)
    except Exception as e:
        result["error"] = f"Training error: {str(e)}"
        return result
    result["elapsed_time_training"] = time.perf_counter() - started

    # Predict on the test set (validation)
    started = time.perf_counter()
    try:
        y_pred = model.predict(x_test)
    except Exception as e:
        result["error"] = f"Prediction error: {str(e)}"
        return result
    result["elapsed_time_validation"] = time.perf_counter() - started
    result["y_pred"] = y_pred

    # Imported here because it is only needed once predictions exist.
    from sklearn.metrics import accuracy_score

    # Calculate accuracy
    try:
        result["accuracy"] = accuracy_score(y_test, y_pred) * 100
    except Exception as e:
        result["error"] = f"Accuracy calculation error: {str(e)}"
        return result

    # Export model if requested. This is best-effort: a failed export is
    # recorded but does not turn a successful training run into a failure.
    if export:
        try:
            import joblib

            full_path = os.path.join(export, f"{model_name}.joblib")
            os.makedirs(os.path.dirname(full_path), exist_ok=True)
            joblib.dump(model, full_path)
            print(f"Model saved to {full_path}")
        except Exception as e:
            print(f"Warning: Failed to export model to {export}: {str(e)}")
            result["export_error"] = str(e)

    result["success"] = True
    return result
def plot_confusion_matrix(results_sensor, y_test, title, models_dir="D:/thesis/models"):
    """
    Plot one confusion matrix per model result.

    For every entry the exported model is loaded from
    ``<models_dir>/<sensor>/<model>.joblib`` to obtain its class labels, a
    confusion matrix of ``y_test`` against the stored predictions is computed
    over exactly those classes, and the matrix is drawn with matplotlib.

    Parameters:
    -----------
    results_sensor : list
        List of dictionaries containing model results. Each entry must provide
        the keys 'sensor', 'model' and 'y_pred'.
    y_test : array-like
        True labels for the test samples.
    title : str
        Title placed on every plot.
    models_dir : str, optional
        Root directory of the exported models. Defaults to the path that was
        previously hard-coded.

    Returns:
    --------
    None
        One figure is drawn per entry; this function does not call
        ``plt.show()`` itself.

    Example
    -------
    >>> results_sensor1 = [
    ...     {'model': 'SVM', 'sensor': 'sensor1', 'y_pred': y_pred_svm},
    ... ]
    >>> plot_confusion_matrix(results_sensor1, y_test, "Sensor 1")
    """
    for entry in results_sensor:
        # NOTE: joblib.load unpickles the file; only load models you trust.
        model_path = os.path.join(
            models_dir, entry["sensor"], f"{entry['model']}.joblib"
        )
        model = load(model_path)

        # get the class labels
        labels = model.classes_

        # Pass the labels explicitly so the matrix rows/columns always line up
        # with the display labels, even when a class the model knows about is
        # absent from both y_test and the predictions.
        cm = confusion_matrix(y_test, entry["y_pred"], labels=labels)

        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap=plt.cm.Blues)  # You can change colormap
        plt.title(f"{title}")
def calculate_label_percentages(labels):
    """
    Calculate and print the percentage distribution of unique labels in a numpy array.

    One line of the form ``Label <value>: <percent>%`` is printed per unique
    label, in the sorted order returned by ``np.unique``.

    Parameters:
        labels (np.array): Input array of labels. Multi-dimensional input is
            treated as a flat collection of labels.

    Returns:
        None
    """
    # Count occurrences of each unique label
    unique, counts = np.unique(labels, return_counts=True)

    # np.unique flattens its input, so the denominator must be the total
    # number of elements; len(labels) would only be the first-axis length.
    percentages = counts / counts.sum() * 100

    # Build and print the result string
    result = "\n".join(
        f"Label {label}: {percentage:.2f}%"
        for label, percentage in zip(unique, percentages)
    )
    print(result)
def inference_model(
    models, raw_file, column_question: int = None
):
    """
    Run a trained model on one column of a raw vibration data file.

    The selected column is read from ``raw_file``, transformed with a
    short-time Fourier transform, and the magnitude spectrum of the first 21
    time frames is passed to the model. The distribution of predicted labels
    is printed via ``calculate_label_percentages``.

    Parameters
    ----------
    models : str
        Path to an exported model file that ``joblib.load`` can read.
    raw_file : str
        Path to a whitespace-delimited text file. The first 10 lines are
        skipped and the next line is used as the header.
    column_question : int
        Zero-based position of the column to analyse. Required despite the
        ``None`` default.

    Returns
    -------
    None
        The label percentages are printed, not returned.

    Raises
    ------
    ValueError
        If ``column_question`` is not given.

    Example
    -------
    >>> model = {"SVM": "models/sensor1/SVM.joblib", "SVM with PCA": "models/sensor1/SVM_with_PCA.joblib"}
    >>> inference_model(model["SVM"], "zzzAD1.TXT", column_question=1)
    """
    from scipy.signal import stft
    # hann lives in scipy.signal.windows; the scipy.signal alias is gone in
    # current SciPy releases.
    from scipy.signal.windows import hann

    if column_question is None:
        raise ValueError(
            "column_question must be the integer position of the column to analyse"
        )

    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(raw_file, sep=r"\s+", skiprows=10, header=0, memory_map=True)
    vibration_data = df.iloc[:, column_question].values

    # Perform STFT. The window is built explicitly (symmetric hann) rather
    # than passed by name, which keeps the spectra identical to those produced
    # before.
    freq, _times, Zxx = stft(
        vibration_data,
        fs=1024,
        window=hann(1024),
        nperseg=1024,
        noverlap=1024 - 512,
    )

    # Zxx is (frequencies, time frames); after transposing, each column is one
    # frequency bin, so name the columns from the frequency axis stft returned.
    data = pd.DataFrame(
        np.abs(Zxx).T,
        columns=[f"Freq_{f:.2f}" for f in freq],
    )
    data = data.rename(columns={"Freq_0.00": "00"})  # To match the model input format

    # NOTE: joblib.load unpickles the file; only load models you trust.
    model = load(models)  # Load the model from the provided path
    return calculate_label_percentages(model.predict(data.iloc[:21, :]))