Add Working Milestone with Initial Results and Model Inference #82
0
code/src/ml/__init__.py
Normal file
0
code/src/ml/__init__.py
Normal file
@@ -1,12 +1,11 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import os
|
||||||
from sklearn.model_selection import train_test_split as sklearn_split
|
from sklearn.model_selection import train_test_split as sklearn_split
|
||||||
|
|
||||||
|
|
||||||
def create_ready_data(
    stft_data_path: str,
    stratify: np.ndarray = None,
) -> tuple:
    """
    Load STFT CSV files from a directory and build a labeled dataset.

    Each CSV file in the directory is treated as one class: its rows are
    the samples, and its position in the sorted filename order is the
    integer class label assigned to every one of its rows.

    Parameters:
    -----------
    stft_data_path : str
        Path to the directory containing STFT data files
        (e.g. 'data/converted/raw/sensor1')
    stratify : np.ndarray, optional
        Unused. Kept only for backward compatibility with callers that
        still pass it (the old train/test-split interface).

    Returns:
    --------
    tuple
        (X, y) - X is the combined feature DataFrame (all files
        concatenated row-wise), y is a 1-D integer array with one class
        label per row of X. Both are empty when the directory has no files.
    """
    # Sort the listing: os.listdir order is platform-dependent, and the
    # file order determines the class labels. Sorting makes the labels
    # reproducible across runs and machines.
    ready_data = [
        pd.read_csv(os.path.join(stft_data_path, file))
        for file in sorted(os.listdir(stft_data_path))
    ]

    # Combine all per-file dataframes into a single feature matrix.
    if ready_data:  # Check if the list is not empty
        X = pd.concat(ready_data, axis=0, ignore_index=True)
        print(f"Type of combined data: {type(X)}")
        print(f"Shape of combined data: {X.shape}")
    else:
        print("No data available in ready_data list")
        X = pd.DataFrame()

    # Label every row of file i with class i.
    y_data = [
        np.full(frame.shape[0], label, dtype=int)
        for label, frame in enumerate(ready_data)
    ]

    if y_data:
        y = np.concatenate(y_data, axis=0)
    else:
        print("No labels available in y_data list")
        y = np.array([])

    return X, y
|
|||||||
Reference in New Issue
Block a user