feat(src): implement working function for dataset B to create ready data from STFT files (stft_files) and add setup.py for package configuration

This commit is contained in:
nuluh
2025-04-24 09:32:22 +07:00
parent 90a5a76609
commit cba4a00cd8
3 changed files with 38 additions and 23 deletions

0
code/src/ml/__init__.py Normal file
View File

View File

@@ -1,12 +1,11 @@
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split as sklearn_split
def create_train_test_split(
ready_data: pd.DataFrame,
test_size: float = 0.2,
random_state: int = 42,
def create_ready_data(
stft_data_path: str,
stratify: np.ndarray = None,
) -> tuple:
"""
@@ -14,12 +13,8 @@ def create_train_test_split(
Parameters:
-----------
data : pd.DataFrame
The input DataFrame containing STFT data
test_size : float
Proportion of data to use for testing (default: 0.2)
random_state : int
Random seed for reproducibility (default: 42)
stft_data_path : str
Path to the directory containing STFT data files (e.g. 'data/converted/raw/sensor1')
stratify : np.ndarray, optional
Labels to use for stratified sampling
@@ -28,23 +23,35 @@ def create_train_test_split(
tuple
(X_train, X_test, y_train, y_test) - Split datasets
"""
ready_data = []
for file in os.listdir(stft_data_path):
ready_data.append(pd.read_csv(os.path.join(stft_data_path, file)))
y_data = [i for i in range(len(ready_data))]
# Combine all dataframes in ready_data into a single dataframe
if ready_data: # Check if the list is not empty
# Use pandas concat function instead of iterative concatenation
combined_data = pd.concat(ready_data, axis=0, ignore_index=True)
print(f"Type of combined data: {type(combined_data)}")
print(f"Shape of combined data: {combined_data.shape}")
else:
print("No data available in ready_data list")
combined_data = pd.DataFrame()
# Store the result in X for compatibility with subsequent code
X = combined_data
for i in range(len(y_data)):
y_data[i] = [y_data[i]] * ready_data[i].shape[0]
y_data[i] = np.array(y_data[i])
# Extract features and labels
X = (
ready_data.drop("label_column", axis=1)
if "label_column" in ready_data.columns
else ready_data
)
y = ready_data["label_column"] if "label_column" in ready_data.columns else stratify
if y_data:
# Use numpy concatenate function instead of iterative concatenation
y = np.concatenate(y_data, axis=0)
else:
print("No labels available in y_data list")
y = np.array([])
# Create split
X_train, X_test, y_train, y_test = sklearn_split(
X, y, test_size=test_size, random_state=random_state, stratify=stratify
)
return X_train, X_test, y_train, y_test
return X, y

8
setup.py Normal file
View File

@@ -0,0 +1,8 @@
from setuptools import setup, find_packages
# Package configuration for the repository. Package discovery is pointed at
# the "code" directory, and the root package namespace ("") is mapped to that
# same directory so the discovered packages resolve to the right location.
setup(
    version="0.1",
    name="thesisrepo",
    package_dir={"": "code"},
    packages=find_packages(where="code"),
)