Merge pull request #100 from nuluh/feature/99-exp-alternative-undamage-case-data

[EXP] Alternative Undamage Case Data
This commit was merged in pull request #100.
This commit is contained in:
Rifqi D. Panuluh
2025-07-24 18:09:05 +07:00
committed by GitHub
6 changed files with 799 additions and 860 deletions

File diff suppressed because one or more lines are too long

View File

@@ -25,9 +25,18 @@ class DamageFilesIndices(TypedDict):
damage_index: int damage_index: int
files: List[str] files: List[str]
def complement_pairs(n, prefix="zzzBD", extension="TXT"):
    """
    Yield the four complement tuples for ``<prefix><n>.<extension>``.

    Each file index ``n`` belongs to a group of 5; its position inside the
    group ((n - 1) % 5 + 1) is the "original" column index, which is skipped.
    The remaining four column indices are yielded together with their
    offset-by-25 partner column.

    Parameters
    ----------
    n : int
        1-based file index (e.g. 1..30).
    prefix : str, default "zzzBD"
        Filename prefix. Defaulted so existing call sites that pass only
        ``n`` (e.g. ``complement_pairs(n)``) keep working.
    extension : str, default "TXT"
        Filename extension (without the dot).

    Yields
    ------
    tuple[str, list[int]]
        ``(filename, [a, a + 25])`` for every ``a`` in 1..5 except the
        original column index.
    """
    filename = f"{prefix}{n}.{extension}"
    orig_a = (n - 1) % 5 + 1  # original column index within the 5-file group
    for a in range(1, 6):
        if a != orig_a:  # skip the original column; yields exactly 4 tuples
            yield (filename, [a, a + 25])
def generate_df_tuples(total_dfs=30, group_size=5, prefix="zzzAD", extension="TXT", first_col_start=1, last_col_offset=25, def generate_df_tuples(total_dfs, prefix, extension, first_col_start, last_col_offset,
special_groups=None, group=True): group_size=5, special_groups=None, group=True):
""" """
Generate a structured list of tuples containing DataFrame references and column indices. Generate a structured list of tuples containing DataFrame references and column indices.
@@ -37,7 +46,7 @@ def generate_df_tuples(total_dfs=30, group_size=5, prefix="zzzAD", extension="TX
Total number of DataFrames to include in the tuples Total number of DataFrames to include in the tuples
group_size : int, default 5 group_size : int, default 5
Number of DataFrames in each group (determines the pattern repeat) Number of DataFrames in each group (determines the pattern repeat)
prefix : str, default "df" prefix : str
Prefix for DataFrame variable names Prefix for DataFrame variable names
first_col_start : int, default 1 first_col_start : int, default 1
Starting value for the first column index (1-indexed) Starting value for the first column index (1-indexed)
@@ -68,22 +77,10 @@ def generate_df_tuples(total_dfs=30, group_size=5, prefix="zzzAD", extension="TX
# Add special groups at specified positions (other than beginning) # Add special groups at specified positions (other than beginning)
if special_groups: if special_groups:
for group in special_groups: result.insert(0, special_groups)
position = group.get('position', 0) # default value is 0 if not specified
df_name = group['df_name']
size = group.get('size', group_size)
# Create the special group tuples
special_tuples = []
for i in range(size):
first_col = first_col_start + i
last_col = first_col + last_col_offset
special_tuples.append((df_name, [first_col, last_col]))
tuples.insert(position, special_tuples)
return tuples return result
# file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT") # file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT")

View File

@@ -1,6 +1,16 @@
from src.ml.model_selection import inference_model from src.ml.model_selection import inference_model
from joblib import load
model = {"SVM": "D:/thesis/models/sensor1/SVM.joblib", x = 30
"SVM with PCA": "D:/thesis/models/sensor1/SVM with StandardScaler and PCA.joblib"} file = f"D:/thesis/data/dataset_B/zzzBD{x}.TXT"
sensor = 1
model = {"SVM": f"D:/thesis/models/sensor{sensor}/SVM.joblib",
"SVM with PCA": f"D:/thesis/models/sensor{sensor}/SVM with StandardScaler and PCA.joblib",
"XGBoost": f"D:/thesis/models/sensor{sensor}/XGBoost.joblib"}
inference_model(model["SVM"], "D:/thesis/data/dataset_A/zzzAD2.TXT", column_question=1) index = ((x-1) % 5) + 1
inference_model(model["SVM"], file, column_question=index)
print("---")
inference_model(model["SVM with PCA"], file, column_question=index)
print("---")
inference_model(model["XGBoost"], file, column_question=index)

View File

@@ -8,7 +8,7 @@ from joblib import load
def create_ready_data( def create_ready_data(
stft_data_path: str, stft_data_path: str,
stratify: np.ndarray = None, stratify: np.ndarray = None,
) -> tuple: ) -> tuple[pd.DataFrame, np.ndarray]:
""" """
Create a stratified train-test split from STFT data. Create a stratified train-test split from STFT data.
@@ -22,7 +22,7 @@ def create_ready_data(
Returns: Returns:
-------- --------
tuple tuple
(X_train, X_test, y_train, y_test) - Split datasets (pd.DataFrame, np.ndarray) - Combined data and corresponding labels
""" """
ready_data = [] ready_data = []
for file in os.listdir(stft_data_path): for file in os.listdir(stft_data_path):
@@ -155,7 +155,7 @@ def train_and_evaluate_model(
except Exception as e: except Exception as e:
result["error"] = f"Training error: {str(e)}" result["error"] = f"Training error: {str(e)}"
return result return result
def plot_confusion_matrix(results_sensor, y_test): def plot_confusion_matrix(results_sensor, y_test, title):
""" """
Plot confusion matrices for each model in results_sensor1. Plot confusion matrices for each model in results_sensor1.
@@ -193,8 +193,7 @@ def plot_confusion_matrix(results_sensor, y_test):
# Plot # Plot
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels) disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Blues) # You can change colormap disp.plot(cmap=plt.cm.Blues) # You can change colormap
plt.title(f"{i['model']} {i['sensor']} Test") plt.title(f"{title}")
plt.show()
def calculate_label_percentages(labels): def calculate_label_percentages(labels):
""" """
@@ -255,9 +254,9 @@ def inference_model(
fs=1024, fs=1024,
window=hann(1024), window=hann(1024),
nperseg=1024, nperseg=1024,
noverlap=512 noverlap=1024-512
) )
data = pd.DataFrame(np.abs(Zxx).T, columns=[f"Freq_{freq:.2f}" for freq in np.linspace(0, 1024/2, Zxx.shape[1])]) data = pd.DataFrame(np.abs(Zxx).T, columns=[f"Freq_{freq:.2f}" for freq in np.linspace(0, 1024/2, Zxx.shape[1])])
data = data.rename(columns={"Freq_0.00": "00"}) # To match the model input format data = data.rename(columns={"Freq_0.00": "00"}) # To match the model input format
model = load(models) # Load the model from the provided path model = load(models) # Load the model from the provided path
return calculate_label_percentages(model.predict(data)) return calculate_label_percentages(model.predict(data.iloc[:21,:]))

View File

@@ -6,7 +6,7 @@ import glob
import multiprocessing # Added import for multiprocessing import multiprocessing # Added import for multiprocessing
# Define the base directory where DAMAGE_X folders are located # Define the base directory where DAMAGE_X folders are located
damage_base_path = 'D:/thesis/data/converted/raw_B' damage_base_path = 'D:/thesis/data/converted/raw'
# Define output directories for each sensor # Define output directories for each sensor
output_dirs = { output_dirs = {
@@ -25,13 +25,10 @@ window = hann(window_size)
Fs = 1024 Fs = 1024
# Number of damage cases (adjust as needed) # Number of damage cases (adjust as needed)
num_damage_cases = 6 # Change to 30 if you have 30 damage cases num_damage_cases = 0 # Change to 30 if you have 30 damage cases
# Number of test runs per damage case
num_test_runs = 5
# Function to perform STFT and return magnitude # Function to perform STFT and return magnitude
def compute_stft(vibration_data): def compute_stft(vibration_data, Fs=Fs, window_size=window_size, hop_size=hop_size):
frequencies, times, Zxx = stft( frequencies, times, Zxx = stft(
vibration_data, vibration_data,
fs=Fs, fs=Fs,
@@ -42,9 +39,13 @@ def compute_stft(vibration_data):
stft_magnitude = np.abs(Zxx) stft_magnitude = np.abs(Zxx)
return stft_magnitude.T # Transpose to have frequencies as columns return stft_magnitude.T # Transpose to have frequencies as columns
def process_damage_case(damage_num): def process_damage_case(damage_num, Fs=Fs, window_size=window_size, hop_size=hop_size, output_dirs=output_dirs):
damage_folder = os.path.join(damage_base_path, f'DAMAGE_{damage_num}') damage_folder = os.path.join(damage_base_path, f'DAMAGE_{damage_num}')
if damage_num == 0:
# Number of test runs per damage case
num_test_runs = 120
else:
num_test_runs = 5
# Check if the damage folder exists # Check if the damage folder exists
if not os.path.isdir(damage_folder): if not os.path.isdir(damage_folder):
print(f"Folder {damage_folder} does not exist. Skipping...") print(f"Folder {damage_folder} does not exist. Skipping...")
@@ -79,20 +80,29 @@ def process_damage_case(damage_num):
print(f"Unexpected number of columns in {file_path}. Expected 2, got {df.shape[1]}. Skipping...") print(f"Unexpected number of columns in {file_path}. Expected 2, got {df.shape[1]}. Skipping...")
continue continue
# Extract vibration data (assuming the second column is sensor data)
vibration_data = df.iloc[:, 1].values vibration_data = df.iloc[:, 1].values
# Perform STFT # Perform STFT
stft_magnitude = compute_stft(vibration_data) stft_magnitude = compute_stft(vibration_data, Fs=Fs, window_size=window_size, hop_size=hop_size)
# Convert STFT result to DataFrame # Convert STFT result to DataFrame
df_stft = pd.DataFrame( df_stft = pd.DataFrame(
stft_magnitude, stft_magnitude,
columns=[f"Freq_{freq:.2f}" for freq in np.linspace(0, Fs/2, stft_magnitude.shape[1])] columns=[f"Freq_{freq:.2f}" for freq in np.linspace(0, Fs/2, stft_magnitude.shape[1])]
) )
# only include 22 samples vector features for first 45 num_test_runs else include 21 samples vector features
if damage_num == 0:
print(f"Processing damage_num = 0, test_num = {test_num}")
if test_num <= 45:
df_stft = df_stft.iloc[:22, :]
print(f"Reduced df_stft shape (21 samples): {df_stft.shape}")
else:
df_stft = df_stft.iloc[:21, :]
print(f"Reduced df_stft shape (22 samples): {df_stft.shape}")
# Append to the aggregated list # Append to the aggregated list
aggregated_stft.append(df_stft) aggregated_stft.append(df_stft)
print(sum(df.shape[0] for df in aggregated_stft))
# Concatenate all STFT DataFrames vertically # Concatenate all STFT DataFrames vertically
if aggregated_stft: if aggregated_stft:

View File

@@ -1,4 +1,4 @@
from convert import * from data_preprocessing import *
from joblib import dump, load from joblib import dump, load
# b = generate_damage_files_index( # b = generate_damage_files_index(
@@ -19,8 +19,15 @@ special_groups_B = [
] ]
# Generate the tuples with the special group # Generate the tuples with the special group
# a = generate_df_tuples(special_groups=special_groups_A) a_complement = [(comp)
b = generate_df_tuples(special_groups=special_groups_B, prefix="zzzBD") for n in range(1, 31)
for comp in complement_pairs(n)]
a = generate_df_tuples(special_groups=a_complement, prefix="zzzAD")
# b_complement = [(comp)
# for n in range(1, 31)
# for comp in complement_pairs(n)]
# b = generate_df_tuples(special_groups=b_complement, prefix="zzzBD")
# a = generate_damage_files_index( # a = generate_damage_files_index(
@@ -32,14 +39,14 @@ b = generate_df_tuples(special_groups=special_groups_B, prefix="zzzBD")
# # undamage_file="zzzBU.TXT" # # undamage_file="zzzBU.TXT"
# ) # )
# data_A = DataProcessor(file_index=a, base_path="D:/thesis/data/dataset_A", include_time=True) data_A = DataProcessor(file_index=a, base_path="D:/thesis/data/dataset_A", include_time=True)
# data_A.create_vector_column(overwrite=True) # data_A.create_vector_column(overwrite=True)
# # data_A.create_limited_sensor_vector_column(overwrite=True) # # data_A.create_limited_sensor_vector_column(overwrite=True)
# data_A.export_to_csv("D:/thesis/data/converted/raw") data_A.export_to_csv("D:/thesis/data/converted/raw")
data_B = DataProcessor(file_index=b, base_path="D:/thesis/data/dataset_B", include_time=True) # data_B = DataProcessor(file_index=b, base_path="D:/thesis/data/dataset_B", include_time=True)
# data_B.create_vector_column(overwrite=True) # data_B.create_vector_column(overwrite=True)
# # data_B.create_limited_sensor_vector_column(overwrite=True) # # data_B.create_limited_sensor_vector_column(overwrite=True)
data_B.export_to_csv("D:/thesis/data/converted/raw_B") # data_B.export_to_csv("D:/thesis/data/converted/raw_B")
# a = load("D:/cache.joblib") # a = load("D:/cache.joblib")
# breakpoint() # breakpoint()