feat(data): Enhance DataProcessor to support dynamic base path and improve data loading with error handling and memory efficiency
This commit is contained in:
@@ -104,13 +104,28 @@ def generate_df_tuples(total_dfs=30, group_size=5, prefix="zzzAD", ext="TXT", fi
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
def __init__(self, file_index: DamageFilesIndices, cache_path: str = None):
|
||||
def __init__(self, file_index, cache_path: str = None, base_path: str = None):
|
||||
self.file_index = file_index
|
||||
self.base_path = base_path
|
||||
if cache_path:
|
||||
self.data = load(cache_path)
|
||||
else:
|
||||
self.data = self._load_all_data()
|
||||
self.data = self.load_data()
|
||||
|
||||
def load_data(self):
    """
    Load every file listed in ``self.file_index`` and keep only the requested columns.

    ``self.file_index`` is walked as a nested sequence of groups whose entries are
    indexable as ``(file_name, column_indices)``. Each entry that loads successfully
    is replaced in place by a DataFrame holding just those columns. An entry that
    fails is left untouched and the error is printed, so one bad file does not
    abort the whole load.

    :return: ``self.file_index`` itself after the in-place replacement, so the
        result can be assigned by the caller instead of ``None``.
    """
    # os.path.join raises TypeError on None; with no base path configured the
    # file names are used exactly as given.
    base_path = self.base_path or ""

    for group_pos, group in enumerate(self.file_index):
        for entry_pos, entry in enumerate(group):
            file_name = entry[0]  # ('zzzAD1.TXT')
            col_indices = entry[1]  # [1, 26]
            file_path = os.path.join(base_path, file_name)
            try:
                # Whitespace-separated text; the first 10 lines are skipped and
                # the next line is used as the header row.
                # sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True.
                df = pd.read_csv(file_path, sep=r"\s+", skiprows=10, header=0, memory_map=True)
                # Copy so the slice does not keep the full frame alive.
                self.file_index[group_pos][entry_pos] = df.iloc[:, col_indices].copy()

                print(f"Processed {file_path}, extracted columns: {col_indices}")

            except Exception as e:
                # Deliberately best-effort: report the failure and continue with
                # the remaining files.
                print(f"Error processing {file_path}: {str(e)}")

    return self.file_index
|
||||
def _load_dataframe(self, file_path: str) -> OriginalSingleDamageScenario:
|
||||
"""
|
||||
Loads a single data file into a pandas DataFrame.
|
||||
@@ -118,7 +133,7 @@ class DataProcessor:
|
||||
:param file_path: Path to the data file.
|
||||
:return: DataFrame containing the numerical data.
|
||||
"""
|
||||
df = pd.read_csv(file_path, delim_whitespace=True, skiprows=10, header=0, memory_map=True)
|
||||
df = pd.read_csv(file_path, delim_whitespace=True, skiprows=10, header=0, memory_map=True, nrows=1)
|
||||
return df
|
||||
|
||||
def _load_all_data(self) -> GroupDataset:
|
||||
@@ -141,6 +156,7 @@ class DataProcessor:
|
||||
|
||||
# Fill the list with data
|
||||
for group_idx, file_list in self.file_index.items():
|
||||
group_idx -= 1 # adjust due to undamaged file
|
||||
data[group_idx] = [self._load_dataframe(file) for file in file_list]
|
||||
return data
|
||||
|
||||
@@ -204,7 +220,7 @@ class DataProcessor:
|
||||
x += 5
|
||||
vector_col_idx.append(c)
|
||||
y += 1
|
||||
return vector_col_idx
|
||||
return vector_col_idx # TODO: refactor so the indices are derived from the first data group only; looping over all of self.data seems unnecessary
|
||||
|
||||
def create_vector_column(self, overwrite=True) -> List[List[List[pd.DataFrame]]]:
|
||||
"""
|
||||
@@ -212,25 +228,15 @@ class DataProcessor:
|
||||
|
||||
:param overwrite: Overwrite the original data with vector column-based data.
|
||||
"""
|
||||
idx = self._create_vector_column_index()
|
||||
# if overwrite:
|
||||
for i in range(len(self.data)):
|
||||
for j in range(len(self.data[i])):
|
||||
# Get the appropriate indices for slicing from idx
|
||||
indices = idx[j]
|
||||
idxs = self._create_vector_column_index()
|
||||
for i, group in enumerate(self.data):
|
||||
# add 1 to all indices to account for 'Time' being at position 0
|
||||
for j, df in enumerate(group):
|
||||
idx = [_ + 1 for _ in idxs[j]]
|
||||
# slice out the desired columns, copy into a fresh DataFrame,
|
||||
# then overwrite self.data[i][j] with it
|
||||
self.data[i][j] = df.iloc[:, idx].copy()
|
||||
|
||||
# Get the current DataFrame
|
||||
df = self.data[i][j]
|
||||
|
||||
# Keep the 'Time' column and select only specified 'Real' columns
|
||||
# First, we add 1 to all indices to account for 'Time' being at position 0
|
||||
real_indices = [index + 1 for index in indices]
|
||||
|
||||
# Create list with Time column index (0) and the adjusted Real indices
|
||||
all_indices = [0] + real_indices
|
||||
|
||||
# Apply the slicing
|
||||
self.data[i][j] = df.iloc[:, all_indices]
|
||||
# TODO: if !overwrite:
|
||||
|
||||
def create_limited_sensor_vector_column(self, overwrite=True):
|
||||
|
||||
Reference in New Issue
Block a user