From 24c1484300b00630222acd9d2f08417ec83f50ba Mon Sep 17 00:00:00 2001
From: nuluh
Date: Mon, 16 Jun 2025 17:35:27 +0700
Subject: [PATCH] feat(data): Enhance DataProcessor to support dynamic base
 path and improve data loading with error handling and memory efficiency

---
 data/QUGS/convert.py | 53 +++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/data/QUGS/convert.py b/data/QUGS/convert.py
index eba7ac9..d291a88 100644
--- a/data/QUGS/convert.py
+++ b/data/QUGS/convert.py
@@ -104,13 +104,29 @@ def generate_df_tuples(total_dfs=30, group_size=5, prefix="zzzAD", ext="TXT", fi
 
 class DataProcessor:
-    def __init__(self, file_index: DamageFilesIndices, cache_path: str = None):
+    def __init__(self, file_index, cache_path: str = None, base_path: str = None):
         self.file_index = file_index
+        self.base_path = base_path
         if cache_path:
             self.data = load(cache_path)
         else:
-            self.data = self._load_all_data()
+            self.data = self.load_data()
+    def load_data(self):
+        for group_idx, group in enumerate(self.file_index):
+            for entry_idx, entry in enumerate(group):
+                file_path = os.path.join(self.base_path, entry[0]) # ('zzzAD1.TXT')
+                col_indices = entry[1] # [1, 26]
+                try:
+                    # Read the CSV file
+                    df = pd.read_csv(file_path, delim_whitespace=True, skiprows=10, header=0, memory_map=True)
+                    self.file_index[group_idx][entry_idx] = df.iloc[:, col_indices].copy() # Extract the specified columns
+
+                    print(f"Processed {file_path}, extracted columns: {col_indices}")
+
+                except Exception as e:
+                    print(f"Error processing {file_path}: {str(e)}")
+        return self.file_index
 
     def _load_dataframe(self, file_path: str) -> OriginalSingleDamageScenario:
         """
         Loads a single data file into a pandas DataFrame.
 
@@ -118,7 +133,7 @@ class DataProcessor:
         :param file_path: Path to the data file.
         :return: DataFrame containing the numerical data.
""" - df = pd.read_csv(file_path, delim_whitespace=True, skiprows=10, header=0, memory_map=True) + df = pd.read_csv(file_path, delim_whitespace=True, skiprows=10, header=0, memory_map=True, nrows=1) return df def _load_all_data(self) -> GroupDataset: @@ -141,6 +156,7 @@ class DataProcessor: # Fill the list with data for group_idx, file_list in self.file_index.items(): + group_idx -= 1 # adjust due to undamage file data[group_idx] = [self._load_dataframe(file) for file in file_list] return data @@ -204,7 +220,7 @@ class DataProcessor: x += 5 vector_col_idx.append(c) y += 1 - return vector_col_idx + return vector_col_idx # TODO: refactor this so that it returns just from first data_group without using for loops through the self.data that seems unnecessary def create_vector_column(self, overwrite=True) -> List[List[List[pd.DataFrame]]]: """ @@ -212,26 +228,16 @@ class DataProcessor: :param overwrite: Overwrite the original data with vector column-based data. """ - idx = self._create_vector_column_index() - # if overwrite: - for i in range(len(self.data)): - for j in range(len(self.data[i])): - # Get the appropriate indices for slicing from idx - indices = idx[j] + idxs = self._create_vector_column_index() + for i, group in enumerate(self.data): + # add 1 to all indices to account for 'Time' being at position 0 + for j, df in enumerate(group): + idx = [_ + 1 for _ in idxs[j]] + # slice out the desired columns, copy into a fresh DataFrame, + # then overwrite self.data[i][j] with it + self.data[i][j] = df.iloc[:, idx].copy() - # Get the current DataFrame - df = self.data[i][j] - - # Keep the 'Time' column and select only specified 'Real' columns - # First, we add 1 to all indices to account for 'Time' being at position 0 - real_indices = [index + 1 for index in indices] - - # Create list with Time column index (0) and the adjusted Real indices - all_indices = [0] + real_indices - - # Apply the slicing - self.data[i][j] = df.iloc[:, all_indices] - # TODO: if !overwrite: + 
# TODO: if !overwrite: def create_limited_sensor_vector_column(self, overwrite=True): """