Add Working Milestone with Initial Results and Model Inference #82
0
code/src/ml/__init__.py
Normal file
0
code/src/ml/__init__.py
Normal file
@@ -1,12 +1,11 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import os
|
||||||
from sklearn.model_selection import train_test_split as sklearn_split
|
from sklearn.model_selection import train_test_split as sklearn_split
|
||||||
|
|
||||||
|
|
||||||
def create_ready_data(
    stft_data_path: str,
    stratify: np.ndarray = None,
) -> tuple:
    """
    Load STFT CSV files from a directory and build a labeled dataset.

    Each CSV file in the directory is treated as one class: its rows are
    the samples, and its position in the sorted filename order is the
    integer class label assigned to every one of its rows.

    Parameters:
    -----------
    stft_data_path : str
        Path to the directory containing STFT data files
        (e.g. 'data/converted/raw/sensor1')
    stratify : np.ndarray, optional
        Unused. Kept only for backward compatibility with callers that
        still pass it (the old train/test-split interface).

    Returns:
    --------
    tuple
        (X, y) - X is the combined feature DataFrame (all files
        concatenated row-wise), y is a 1-D integer array with one class
        label per row of X. Both are empty when the directory has no files.
    """
    # Sort the listing: os.listdir order is platform-dependent, and the
    # file order determines the class labels. Sorting makes the labels
    # reproducible across runs and machines.
    ready_data = [
        pd.read_csv(os.path.join(stft_data_path, file))
        for file in sorted(os.listdir(stft_data_path))
    ]

    # Combine all per-file dataframes into a single feature matrix.
    if ready_data:  # Check if the list is not empty
        X = pd.concat(ready_data, axis=0, ignore_index=True)
        print(f"Type of combined data: {type(X)}")
        print(f"Shape of combined data: {X.shape}")
    else:
        print("No data available in ready_data list")
        X = pd.DataFrame()

    # Label every row of file i with class i.
    y_data = [
        np.full(frame.shape[0], label, dtype=int)
        for label, frame in enumerate(ready_data)
    ]

    if y_data:
        y = np.concatenate(y_data, axis=0)
    else:
        print("No labels available in y_data list")
        y = np.array([])

    return X, y
|
|||||||
Reference in New Issue
Block a user