feat(data): Propose new damage file index generation to improve structure and flexibility in DataFrame handling

This commit is contained in:
nuluh
2025-06-16 03:13:07 +07:00
parent 3e652accfb
commit 60ff4e0fa9

View File

@@ -26,70 +26,78 @@ class DamageFilesIndices(TypedDict):
files: List[str] files: List[str]
def generate_df_tuples(total_dfs=30, group_size=5, prefix="zzzAD", ext="TXT", first_col_start=1, last_col_offset=25,
                       special_groups=None, group=True):
    """
    Generate a structured list of tuples containing DataFrame references and column indices.

    Parameters:
    -----------
    total_dfs : int, default 30
        Total number of DataFrames to include in the tuples
    group_size : int, default 5
        Number of DataFrames in each group (determines the pattern repeat)
    prefix : str, default "zzzAD"
        Prefix for DataFrame file names
    ext : str, default "TXT"
        File extension appended to each DataFrame name
    first_col_start : int, default 1
        Starting value for the first column index (1-indexed)
    last_col_offset : int, default 25
        Offset to add to first_col_start to get the last column index
    special_groups : list of dict, optional
        List of special groups to insert; each dict should contain:
        - 'df_name': The DataFrame name to use for all tuples in this group (required)
        - 'position': Index in the flat tuple list where this group is spliced in
          (default 0, i.e. the beginning)
        - 'size': Size of this group (default: same as group_size)
    group : bool, default True
        When True, chunk the flat tuple list into sublists of group_size.

    Returns:
    --------
    list
        List of (df_name, [first_col, last_col]) tuples, or a list of such
        sublists of length group_size when ``group`` is True.
    """
    tuples = []

    # Regular entries: the column window cycles with the position inside each
    # group of `group_size` DataFrames.
    for i in range(1, total_dfs + 1):
        position_in_group = ((i - 1) % group_size) + 1
        first_col = first_col_start + position_in_group - 1
        last_col = first_col + last_col_offset
        tuples.append((f"{prefix}{i}.{ext}", [first_col, last_col]))

    # Splice special groups into the flat list at their requested positions.
    if special_groups:
        # NOTE: the loop variable must NOT be named `group` — doing so shadows
        # the boolean `group` parameter and corrupts the grouping decision below
        # (a dict is always truthy, so `group=False` would be ignored).
        for spec in special_groups:
            position = spec.get('position', 0)  # 0 == insert at the beginning
            df_name = spec['df_name']
            size = spec.get('size', group_size)
            special_tuples = []
            for j in range(1, size + 1):
                first_col = first_col_start + j - 1
                last_col = first_col + last_col_offset
                special_tuples.append((df_name, [first_col, last_col]))
            # list.insert takes (index, item) — splice the whole group in with
            # slice assignment instead. This also honors position 0, which the
            # docstring promises but the old `if position > 0` guard dropped.
            tuples[position:position] = special_tuples

    if group:
        # Chunk the flat list into sublists of group_size.
        return [tuples[i:i + group_size] for i in range(0, len(tuples), group_size)]
    return tuples
damage_scenarios = {}
a = file_index_start
b = col + 1
for i in range(1, num_damage + 1):
damage_scenarios[i] = range(a, b)
a += col
b += col
# return damage_scenarios
x = {}
if undamage_file:
try:
x[0] = []
if base_path:
x[0].append(
os.path.normpath(os.path.join(base_path, f"{undamage_file}"))
)
else:
x[0].append(f"{prefix}{undamage_file}")
except Exception as e:
print(Fore.RED + f"Error processing undamaged file: {e}")
sys.exit(1)
else:
print(Fore.RED + "No undamaged file specified, terminating.")
sys.exit(1)
for damage, files in damage_scenarios.items():
x[damage] = [] # Initialize each key with an empty list
for i, file_index in enumerate(files, start=1):
if base_path:
x[damage].append(
os.path.normpath(
os.path.join(base_path, f"{prefix}{file_index}{extension}")
)
)
# if not os.path.exists(file_path):
# print(Fore.RED + f"File {file_path} does not exist.")
# continue
else:
x[damage].append(f"{prefix}{file_index}{extension}")
return x
# file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT") # file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT")
# df = pd.read_csv(file_path, sep="\t", skiprows=10) # Read with explicit column names # df = pd.read_csv(file_path, sep="\t", skiprows=10) # Read with explicit column names
@@ -189,7 +197,7 @@ class DataProcessor:
y = 0 y = 0
for data_group in self.data: # len(data_group[i]) = 5 for data_group in self.data: # len(data_group[i]) = 5
for j in data_group: # len(j[i]) = for j in data_group: # len(j[i]) =
c: VectorColumnIndex = [] # column vector c_{j} c: VectorColumnIndex = []
x = 0 x = 0
for _ in range(6): # TODO: range(6) should be dynamic and parameterized for _ in range(6): # TODO: range(6) should be dynamic and parameterized
c.append(x + y) c.append(x + y)