feat(src): implement working function for dataset B to create ready data from STFT files and add setup.py for package configuration
This commit is contained in:
0
code/src/ml/__init__.py
Normal file
0
code/src/ml/__init__.py
Normal file
@@ -1,12 +1,11 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import os
|
||||
from sklearn.model_selection import train_test_split as sklearn_split
|
||||
|
||||
|
||||
def create_ready_data(
    stft_data_path: str,
    stratify: np.ndarray = None,
) -> tuple:
    """
    Build a ready-to-use (X, y) dataset from a directory of STFT CSV files.

    Each CSV file in ``stft_data_path`` is read into a DataFrame; all frames
    are concatenated row-wise into a single feature matrix X, and every row
    is labeled with the (sorted) index of the file it came from.

    Parameters:
    -----------
    stft_data_path : str
        Path to the directory containing STFT data files
        (e.g. 'data/converted/raw/sensor1')
    stratify : np.ndarray, optional
        Labels to use for stratified sampling.
        NOTE(review): currently unused in this function; kept for
        interface compatibility with downstream split helpers.

    Returns:
    --------
    tuple
        (X, y) - combined feature DataFrame and per-row file-index labels.
        If the directory is empty, returns an empty DataFrame and an
        empty numpy array.
    """
    # Sort the listing so label assignment is deterministic across
    # platforms (os.listdir order is arbitrary per POSIX/Python docs).
    ready_data = [
        pd.read_csv(os.path.join(stft_data_path, file))
        for file in sorted(os.listdir(stft_data_path))
    ]

    # One class label per source file: 0, 1, 2, ...
    y_data = [i for i in range(len(ready_data))]

    # Combine all dataframes in ready_data into a single dataframe
    if ready_data:  # Check if the list is not empty
        # Use pandas concat function instead of iterative concatenation
        combined_data = pd.concat(ready_data, axis=0, ignore_index=True)

        print(f"Type of combined data: {type(combined_data)}")
        print(f"Shape of combined data: {combined_data.shape}")
    else:
        print("No data available in ready_data list")
        combined_data = pd.DataFrame()

    X = combined_data

    # Expand each file's label to one entry per row of that file.
    for i in range(len(y_data)):
        y_data[i] = [y_data[i]] * ready_data[i].shape[0]
        y_data[i] = np.array(y_data[i])

    if y_data:
        # Use numpy concatenate function instead of iterative concatenation
        y = np.concatenate(y_data, axis=0)
    else:
        print("No labels available in y_data list")
        y = np.array([])

    return X, y
|
||||
|
||||
Reference in New Issue
Block a user