feat(src): implement working function for dataset B to create ready data from STFT files and add setup.py for package configuration
This commit is contained in:
0
code/src/ml/__init__.py
Normal file
0
code/src/ml/__init__.py
Normal file
@@ -1,12 +1,11 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import os
|
||||
from sklearn.model_selection import train_test_split as sklearn_split
|
||||
|
||||
|
||||
def create_ready_data(
    stft_data_path: str,
    stratify: np.ndarray = None,
) -> tuple:
    """
    Build a ready-to-use (X, y) dataset from a directory of STFT CSV files.

    Each CSV file in ``stft_data_path`` is read into a DataFrame; all frames
    are concatenated row-wise into a single feature matrix X, and every row
    is labeled with the (sorted) index of the file it came from.

    Parameters:
    -----------
    stft_data_path : str
        Path to the directory containing STFT data files
        (e.g. 'data/converted/raw/sensor1')
    stratify : np.ndarray, optional
        Labels to use for stratified sampling.
        NOTE(review): currently unused in this function; kept for
        interface compatibility with downstream split helpers.

    Returns:
    --------
    tuple
        (X, y) - combined feature DataFrame and per-row file-index labels.
        If the directory is empty, returns an empty DataFrame and an
        empty numpy array.
    """
    # Sort the listing so label assignment is deterministic across
    # platforms (os.listdir order is arbitrary per POSIX/Python docs).
    ready_data = [
        pd.read_csv(os.path.join(stft_data_path, file))
        for file in sorted(os.listdir(stft_data_path))
    ]

    # One class label per source file: 0, 1, 2, ...
    y_data = [i for i in range(len(ready_data))]

    # Combine all dataframes in ready_data into a single dataframe
    if ready_data:  # Check if the list is not empty
        # Use pandas concat function instead of iterative concatenation
        combined_data = pd.concat(ready_data, axis=0, ignore_index=True)

        print(f"Type of combined data: {type(combined_data)}")
        print(f"Shape of combined data: {combined_data.shape}")
    else:
        print("No data available in ready_data list")
        combined_data = pd.DataFrame()

    X = combined_data

    # Expand each file's label to one entry per row of that file.
    for i in range(len(y_data)):
        y_data[i] = [y_data[i]] * ready_data[i].shape[0]
        y_data[i] = np.array(y_data[i])

    if y_data:
        # Use numpy concatenate function instead of iterative concatenation
        y = np.concatenate(y_data, axis=0)
    else:
        print("No labels available in y_data list")
        y = np.array([])

    return X, y
|
||||
|
||||
Reference in New Issue
Block a user