diff --git a/code/src/ml/__init__.py b/code/src/ml/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/code/src/ml/model_selection.py b/code/src/ml/model_selection.py
index 7c97fce..6f35487 100644
--- a/code/src/ml/model_selection.py
+++ b/code/src/ml/model_selection.py
@@ -1,12 +1,11 @@
 import numpy as np
 import pandas as pd
+import os
 from sklearn.model_selection import train_test_split as sklearn_split
 
 
-def create_train_test_split(
-    ready_data: pd.DataFrame,
-    test_size: float = 0.2,
-    random_state: int = 42,
+def create_ready_data(
+    stft_data_path: str,
     stratify: np.ndarray = None,
 ) -> tuple:
     """
@@ -14,12 +13,8 @@ def create_train_test_split(
 
     Parameters:
     -----------
-    data : pd.DataFrame
-        The input DataFrame containing STFT data
-    test_size : float
-        Proportion of data to use for testing (default: 0.2)
-    random_state : int
-        Random seed for reproducibility (default: 42)
+    stft_data_path : str
+        Path to the directory containing STFT data files (e.g. 'data/converted/raw/sensor1')
     stratify : np.ndarray, optional
         Labels to use for stratified sampling
 
@@ -28,23 +23,35 @@ def create_train_test_split(
     tuple
         (X_train, X_test, y_train, y_test) - Split datasets
     """
+    ready_data = []
+    for file in os.listdir(stft_data_path):
+        ready_data.append(pd.read_csv(os.path.join(stft_data_path, file)))
+    y_data = [i for i in range(len(ready_data))]
+    # Combine all dataframes in ready_data into a single dataframe
+    if ready_data:  # Check if the list is not empty
+        # Use pandas concat function instead of iterative concatenation
+        combined_data = pd.concat(ready_data, axis=0, ignore_index=True)
+
+        print(f"Type of combined data: {type(combined_data)}")
+        print(f"Shape of combined data: {combined_data.shape}")
+    else:
+        print("No data available in ready_data list")
+        combined_data = pd.DataFrame()
+
+    # Store the result in X for compatibility with subsequent code
+    X = combined_data
+    for i in range(len(y_data)):
+        y_data[i] = [y_data[i]] * ready_data[i].shape[0]
+        y_data[i] = np.array(y_data[i])
 
-    # Extract features and labels
-    X = (
-        ready_data.drop("label_column", axis=1)
-        if "label_column" in ready_data.columns
-        else ready_data
-    )
-    y = ready_data["label_column"] if "label_column" in ready_data.columns else stratify
+    if y_data:
+        # Use numpy concatenate function instead of iterative concatenation
+        y = np.concatenate(y_data, axis=0)
+    else:
+        print("No labels available in y_data list")
+        y = np.array([])
 
-    # Create split
-    X_train, X_test, y_train, y_test = sklearn_split(
-        X, y, test_size=test_size, random_state=random_state, stratify=stratify
-    )
-
-    return X_train, X_test, y_train, y_test
+    return X, y
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ece2e52
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,8 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="thesisrepo",
+    version="0.1",
+    packages=find_packages(where="code"),
+    package_dir={"": "code"},
+)