diff --git a/.vscode/settings.json b/.vscode/settings.json index a8b3783..5db57d3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,7 @@ { - "python.analysis.extraPaths": ["./code/src/features"], + "python.analysis.extraPaths": [ + "./code/src/features", + "${workspaceFolder}/code/src" + ], "jupyter.notebookFileRoot": "${workspaceFolder}/code" } diff --git a/code/notebooks/stft.ipynb b/code/notebooks/stft.ipynb index e32eda5..ca3bd5d 100644 --- a/code/notebooks/stft.ipynb +++ b/code/notebooks/stft.ipynb @@ -17,8 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "sensor1 = pd.read_csv('D:/thesis/data/converted/raw/DAMAGE_1/DAMAGE_1_TEST1_01.csv',sep=',')\n", - "sensor2 = pd.read_csv('D:/thesis/data/converted/raw/DAMAGE_1/DAMAGE_1_TEST1_02.csv',sep=',')" + "sensor1 = pd.read_csv('D:/thesis/data/converted/raw/DAMAGE_1/DAMAGE_0_TEST1_01.csv',sep=',')\n", + "sensor2 = pd.read_csv('D:/thesis/data/converted/raw/DAMAGE_1/DAMAGE_0_TEST1_02.csv',sep=',')" ] }, { @@ -101,13 +101,16 @@ "source": [ "# Combined Plot for sensor 1 and sensor 2 from data1 file in which motor is operated at 800 rpm\n", "\n", - "plt.plot(df1['s2'], label='sensor 2')\n", - "plt.plot(df1['s1'], label='sensor 1', alpha=0.5)\n", + "plt.plot(df1['s2'], label='Sensor 1', color='C1', alpha=0.6)\n", + "plt.plot(df1['s1'], label='Sensor 2', color='C0', alpha=0.6)\n", "plt.xlabel(\"Number of samples\")\n", "plt.ylabel(\"Amplitude\")\n", "plt.title(\"Raw vibration signal\")\n", "plt.ylim(-7.5, 5)\n", "plt.legend()\n", + "plt.locator_params(axis='x', nbins=8)\n", + "plt.ylim(-1, 1) # Adjust range as needed\n", + "plt.grid(True, linestyle='--', alpha=0.5)\n", "plt.show()" ] }, @@ -334,9 +337,44 @@ "metadata": {}, "outputs": [], "source": [ - "# len(ready_data1a)\n", - "# plt.pcolormesh(ready_data1[0])\n", - "ready_data1a[0].max().max()" + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from mpl_toolkits.mplot3d import Axes3D\n", + "\n", + "# Assuming ready_data1a[0] 
is a DataFrame or 2D array\n", + "spectrogram_data = ready_data1a[0].values # Convert to NumPy array if it's a DataFrame\n", + "\n", + "# Get the dimensions of the spectrogram\n", + "num_frequencies, num_time_frames = spectrogram_data.shape\n", + "\n", + "# Create frequency and time arrays\n", + "frequencies = np.arange(num_frequencies) # Replace with actual frequency values if available\n", + "time_frames = np.arange(num_time_frames) # Replace with actual time values if available\n", + "\n", + "# Create a meshgrid for plotting\n", + "T, F = np.meshgrid(time_frames, frequencies)\n", + "\n", + "# Create a 3D plot\n", + "fig = plt.figure(figsize=(12, 8))\n", + "ax = fig.add_subplot(111, projection='3d')\n", + "\n", + "# Plot the surface\n", + "surf = ax.plot_surface(T, F, spectrogram_data, cmap='bwr', edgecolor='none')\n", + "\n", + "# Add labels and a color bar\n", + "ax.set_xlabel('Time Frames')\n", + "ax.set_ylabel('Frequency [Hz]')\n", + "ax.set_zlabel('Magnitude')\n", + "ax.set_title('3D Spectrogram')\n", + "# Resize the z-axis (shrink it)\n", + "z_min, z_max = 0, 0.1 # Replace with your desired range\n", + "ax.set_zlim(z_min, z_max)\n", + "ax.get_proj = lambda: np.dot(Axes3D.get_proj(ax), np.diag([1, 1, 0.5, 1])) # Shrink z-axis by 50%\n", + "ax.set_facecolor('white')\n", + "fig.colorbar(surf, ax=ax, shrink=0.5, aspect=10)\n", + "\n", + "# Show the plot\n", + "plt.show()" ] }, { @@ -345,13 +383,32 @@ "metadata": {}, "outputs": [], "source": [ + "from cmcrameri import cm\n", + "# Create a figure and subplots\n", + "fig, axes = plt.subplots(2, 3, figsize=(15, 8), sharex=True, sharey=True)\n", + "\n", + "# Flatten the axes array for easier iteration\n", + "axes = axes.flatten()\n", + "\n", + "# Loop through each subplot and plot the data\n", "for i in range(6):\n", - " plt.pcolormesh(ready_data1a[i], cmap=\"jet\", vmax=0.03, vmin=0.0)\n", - " plt.colorbar() \n", - " plt.title(f'STFT Magnitude for case {i} sensor 1')\n", - " plt.xlabel(f'Frequency [Hz]')\n", - " 
plt.ylabel(f'Time [sec]')\n", - " plt.show()" + " pcm = axes[i].pcolormesh(ready_data1a[i].transpose(), cmap='bwr', vmax=0.03, vmin=0.0)\n", + " axes[i].set_title(f'Case {i} Sensor A', fontsize=12)\n", + "\n", + "# Add a single color bar for all subplots\n", + "# Use the first `pcolormesh` object (or any valid one) for the color bar\n", + "cbar = fig.colorbar(pcm, ax=axes, orientation='vertical')\n", + "# cbar.set_label('Magnitude')\n", + "\n", + "# Set shared labels\n", + "fig.text(0.5, 0.04, 'Time Frames', ha='center', fontsize=12)\n", + "fig.text(0.04, 0.5, 'Frequency [Hz]', va='center', rotation='vertical', fontsize=12)\n", + "\n", + "# Adjust layout\n", + "# plt.tight_layout(rect=[0.05, 0.05, 1, 1]) # Leave space for shared labels\n", + "plt.subplots_adjust(left=0.1, right=0.75, top=0.9, bottom=0.1, wspace=0.2, hspace=0.2)\n", + "\n", + "plt.show()" ] }, { @@ -576,6 +633,16 @@ "X2a, y = create_ready_data('D:/thesis/data/converted/raw/sensor2')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X1a.iloc[-1,:]\n", + "# y[2565]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -621,23 +688,12 @@ "metadata": {}, "outputs": [], "source": [ - "def train_and_evaluate_model(model, model_name, sensor_label, x_train, y_train, x_test, y_test):\n", - " model.fit(x_train, y_train)\n", - " y_pred = model.predict(x_test)\n", - " accuracy = accuracy_score(y_test, y_pred) * 100\n", - " return {\n", - " \"model\": model_name,\n", - " \"sensor\": sensor_label,\n", - " \"accuracy\": accuracy\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "from src.ml.model_selection import train_and_evaluate_model\n", + "from sklearn.svm import SVC\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.svm import SVC\n", + "from sklearn.decomposition import PCA\n", "# Define 
models for sensor1\n", "models_sensor1 = {\n", " # \"Random Forest\": RandomForestClassifier(),\n", @@ -646,12 +702,18 @@ " # \"KNN\": KNeighborsClassifier(),\n", " # \"LDA\": LinearDiscriminantAnalysis(),\n", " \"SVM\": SVC(),\n", - " \"XGBoost\": XGBClassifier()\n", + " \"SVM with StandardScaler and PCA\": make_pipeline(\n", + " StandardScaler(),\n", + " PCA(n_components=10),\n", + " SVC(kernel='rbf')\n", + " ),\n", + "\n", + " # \"XGBoost\": XGBClassifier()\n", "}\n", "\n", "results_sensor1 = []\n", "for name, model in models_sensor1.items():\n", - " res = train_and_evaluate_model(model, name, \"sensor1\", x_train1, y_train, x_test1, y_test)\n", + " res = train_and_evaluate_model(model, name, \"sensor1\", x_train1, y_train, x_test1, y_test, export='D:/thesis/models/sensor1')\n", " results_sensor1.append(res)\n", " print(f\"{name} on sensor1: Accuracy = {res['accuracy']:.2f}%\")\n" ] @@ -669,12 +731,17 @@ " # \"KNN\": KNeighborsClassifier(),\n", " # \"LDA\": LinearDiscriminantAnalysis(),\n", " \"SVM\": SVC(),\n", - " \"XGBoost\": XGBClassifier()\n", + " \"SVM with StandardScaler and PCA\": make_pipeline(\n", + " StandardScaler(),\n", + " PCA(n_components=10),\n", + " SVC(kernel='rbf')\n", + " ),\n", + " # \"XGBoost\": XGBClassifier()\n", "}\n", "\n", "results_sensor2 = []\n", "for name, model in models_sensor2.items():\n", - " res = train_and_evaluate_model(model, name, \"sensor2\", x_train2, y_train, x_test2, y_test)\n", + " res = train_and_evaluate_model(model, name, \"sensor2\", x_train2, y_train, x_test2, y_test, export='D:/thesis/models/sensor2')\n", " results_sensor2.append(res)\n", " print(f\"{name} on sensor2: Accuracy = {res['accuracy']:.2f}%\")\n" ] @@ -787,6 +854,8 @@ "source": [ "from sklearn.metrics import accuracy_score, classification_report\n", "# 4. Validate on Dataset B\n", + "from joblib import load\n", + "svm_model = load('D:/thesis/models/sensor1/SVM.joblib')\n", "y_pred_svm = svm_model.predict(X1b)\n", "\n", "# 5. 
Evaluate\n", @@ -794,6 +863,30 @@ "print(classification_report(y, y_pred_svm))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model sensor 1 to predict sensor 2 data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score, classification_report\n", + "# 4. Validate on Dataset B\n", + "from joblib import load\n", + "svm_model = load('D:/thesis/models/sensor1/SVM.joblib')\n", + "y_pred_svm = svm_model.predict(X2b)\n", + "\n", + "# 5. Evaluate\n", + "print(\"Accuracy on Dataset B:\", accuracy_score(y, y_pred_svm))\n", + "print(classification_report(y, y_pred_svm))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -853,7 +946,7 @@ "# Plot\n", "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)\n", "disp.plot(cmap=plt.cm.Blues) # You can change colormap\n", - "plt.title(\"SVM Sensor1 CM Train w/ Dataset A Val w/ Dataset B\")\n", + "plt.title(\"SVM Sensor1 CM Train w/ Dataset A Val w/ Dataset B from Sensor2 readings\")\n", "plt.show()" ] }, @@ -871,14 +964,14 @@ "outputs": [], "source": [ "# 1. Predict sensor 1 on Dataset A\n", - "y_train_pred = svm_model.predict(x_train1)\n", + "y_test_pred = svm_model.predict(x_test1)\n", "\n", "# 2. Import confusion matrix tools\n", "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", "import matplotlib.pyplot as plt\n", "\n", "# 3. 
Create and plot confusion matrix\n", - "cm_train = confusion_matrix(y_train, y_train_pred)\n", + "cm_train = confusion_matrix(y_test, y_test_pred)\n", "labels = svm_model.classes_\n", "\n", "disp = ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels=labels)\n", diff --git a/code/src/ml/model_selection.py b/code/src/ml/model_selection.py index 6f35487..3dc73ff 100644 --- a/code/src/ml/model_selection.py +++ b/code/src/ml/model_selection.py @@ -25,9 +25,9 @@ def create_ready_data( """ ready_data = [] for file in os.listdir(stft_data_path): - ready_data.append(pd.read_csv(os.path.join(stft_data_path, file))) + ready_data.append(pd.read_csv(os.path.join(stft_data_path, file), skiprows=1)) - y_data = [i for i in range(len(ready_data))] + y_data = [i for i in range(len(ready_data))] # TODO: Should be replaced with actual desired labels # Combine all dataframes in ready_data into a single dataframe if ready_data: # Check if the list is not empty @@ -55,3 +55,101 @@ def create_ready_data( y = np.array([]) return X, y + + +def train_and_evaluate_model( + model, model_name, sensor_label, x_train, y_train, x_test, y_test, export=None +): + """ + Train a machine learning model, evaluate its performance, and optionally export it. + + This function trains the provided model on the training data, evaluates its + performance on test data using accuracy score, and can save the trained model + to disk if an export path is provided. + + Parameters + ---------- + model : estimator object + The machine learning model to train. + model_name : str + Name of the model, used for the export filename and in the returned results. + sensor_label : str + Label identifying which sensor's data the model is being trained on. + x_train : array-like or pandas.DataFrame + The training input samples. + y_train : array-like + The target values for training. + x_test : array-like or pandas.DataFrame + The test input samples. + y_test : array-like + The target values for testing. 
+ export : str, optional + Directory path where the trained model should be saved. If None, model won't be saved. + + Returns + ------- + dict + Dictionary containing: + - 'model': model_name (str) + - 'sensor': sensor_label (str) + - 'accuracy': accuracy percentage (float) + + Example + ------- + >>> from sklearn.svm import SVC + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2) + >>> result = train_and_evaluate_model( + ... SVC(), + ... "SVM", + ... "sensor1", + ... X_train, + ... y_train, + ... X_test, + ... y_test, + ... export="models/sensor1" + ... ) + >>> print(f"Model accuracy: {result['accuracy']:.2f}%") + """ + from sklearn.metrics import accuracy_score + + result = {"model": model_name, "sensor": sensor_label, "success": False} + + try: + # Train the model + model.fit(x_train, y_train) + + try: + y_pred = model.predict(x_test) + except Exception as e: + result["error"] = f"Prediction error: {str(e)}" + return result + + # Calculate accuracy + try: + accuracy = accuracy_score(y_test, y_pred) * 100 + result["accuracy"] = accuracy + except Exception as e: + result["error"] = f"Accuracy calculation error: {str(e)}" + return result + + # Export model if requested + if export: + try: + import joblib + + full_path = os.path.join(export, f"{model_name}.joblib") + os.makedirs(os.path.dirname(full_path), exist_ok=True) + joblib.dump(model, full_path) + print(f"Model saved to {full_path}") + except Exception as e: + print(f"Warning: Failed to export model to {export}: {str(e)}") + result["export_error"] = str(e) + # Continue despite export error + + result["success"] = True + return result + + except Exception as e: + result["error"] = f"Training error: {str(e)}" + return result diff --git a/code/src/process_stft.py b/code/src/process_stft.py index 1de44b4..fc8c28b 100644 --- a/code/src/process_stft.py +++ b/code/src/process_stft.py @@ -6,7 +6,7 @@ import glob import 
multiprocessing # Added import for multiprocessing # Define the base directory where DAMAGE_X folders are located -damage_base_path = 'D:/thesis/data/converted/raw' +damage_base_path = 'D:/thesis/data/converted/raw_B' # Define output directories for each sensor output_dirs = { @@ -105,11 +105,13 @@ def process_damage_case(damage_num): ) # Save the aggregated STFT to CSV - df_aggregated.to_csv(output_file, index=False) + with open(output_file, 'w') as file: + file.write('sep=,\n') + df_aggregated.to_csv(file, index=False) print(f"Saved aggregated STFT for Sensor {sensor_num}, Damage {damage_num} to {output_file}") else: print(f"No STFT data aggregated for Sensor {sensor_num}, Damage {damage_num}.") if __name__ == "__main__": # Added main guard for multiprocessing with multiprocessing.Pool() as pool: - pool.map(process_damage_case, range(1, num_damage_cases + 1)) + pool.map(process_damage_case, range(0, num_damage_cases + 1)) diff --git a/data/QUGS/convert.py b/data/QUGS/convert.py index 95f1073..c86e34c 100644 --- a/data/QUGS/convert.py +++ b/data/QUGS/convert.py @@ -26,73 +26,109 @@ class DamageFilesIndices(TypedDict): files: List[str] -def generate_damage_files_index(**kwargs) -> DamageFilesIndices: - prefix: str = kwargs.get("prefix", "zzzAD") - extension: str = kwargs.get("extension", ".TXT") - num_damage: int = kwargs.get("num_damage") - file_index_start: int = kwargs.get("file_index_start") - col: int = kwargs.get("col") - base_path: str = kwargs.get("base_path") +def generate_df_tuples(total_dfs=30, group_size=5, prefix="zzzAD", ext="TXT", first_col_start=1, last_col_offset=25, + special_groups=None, group=True): + """ + Generate a structured list of tuples containing DataFrame references and column indices. 
+ + Parameters: + ----------- + total_dfs : int, default 30 + Total number of DataFrames to include in the tuples + group_size : int, default 5 + Number of DataFrames in each group (determines the pattern repeat) + prefix : str, default "df" + Prefix for DataFrame variable names + first_col_start : int, default 1 + Starting value for the first column index (1-indexed) + last_col_offset : int, default 25 + Offset to add to first_col_start to get the last column index + special_groups : list of dict, optional + List of special groups to insert, each dict should contain: + - 'df_name': The DataFrame name to use for all tuples in this group + - 'position': Where to insert this group (0 for beginning) + - 'size': Size of this group (default: same as group_size) + + Returns: + -------- + list + List of tuples, where each tuple contains (df_name, [first_col, last_col]) + """ + tuples = [] + # Add regular groups + for i in range(1, total_dfs + 1): + # for _ in range(group_size): # group tuple + # temporary list to hold tuples for this group + # list = [] + # Calculate the position within the group (1 to group_size) + position_in_group = ((i - 1) % group_size) + 1 + + # Calculate column indices based on position in group + first_col = first_col_start + position_in_group - 1 + last_col = first_col + last_col_offset + + # Create the tuple with DataFrame reference and column indices + df_name = f"{prefix}{i}.{ext}" + tuples.append((df_name, [first_col, last_col])) - damage_scenarios = {} - a = file_index_start - b = col + 1 - for i in range(1, num_damage + 1): - damage_scenarios[i] = range(a, b) - a += col - b += col + if group: + # Group tuples into sublists of group_size + grouped_tuples = [] + for i in range(0, len(tuples), group_size): + grouped_tuples.append(tuples[i:i + group_size]) + tuples = grouped_tuples + # tuples.append(list) + # Add special groups at specified positions (other than beginning) + if special_groups: + for group in special_groups: + position = 
group.get('position', 0) # default value is 0 if not specified + df_name = group['df_name'] + size = group.get('size', group_size) + + # Create the special group tuples + special_tuples = [] + for i in range(size): + first_col = first_col_start + i + last_col = first_col + last_col_offset + special_tuples.append((df_name, [first_col, last_col])) + + tuples.insert(position, special_tuples) + + + return tuples - # return damage_scenarios - - x = {} - for damage, files in damage_scenarios.items(): - x[damage] = [] # Initialize each key with an empty list - for i, file_index in enumerate(files, start=1): - if base_path: - x[damage].append( - os.path.normpath( - os.path.join(base_path, f"{prefix}{file_index}{extension}") - ) - ) - # if not os.path.exists(file_path): - # print(Fore.RED + f"File {file_path} does not exist.") - # continue - else: - x[damage].append(f"{prefix}{file_index}{extension}") - return x # file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT") - # df = pd.read_csv( file_path, sep="\t", skiprows=10) # Read with explicit column names + # df = pd.read_csv(file_path, sep="\t", skiprows=10) # Read with explicit column names class DataProcessor: - def __init__(self, file_index: DamageFilesIndices, cache_path: str = None): + def __init__(self, file_index, cache_path: str = None, base_path: str = None, include_time: bool = False): self.file_index = file_index + self.base_path = base_path + self.include_time = include_time if cache_path: self.data = load(cache_path) else: - self.data = self._load_all_data() - - def _extract_column_names(self, file_path: str) -> List[str]: - """ - Extracts column names from the header of the given file. - Assumes the 6th line contains column names. - - :param file_path: Path to the data file. - :return: List of column names. 
- """ - with open(file_path, "r") as f: - header_lines = [next(f) for _ in range(12)] - - # Extract column names from the 6th line - channel_line = header_lines[10].strip() - tokens = re.findall(r'"([^"]+)"', channel_line) - if not channel_line.startswith('"'): - first_token = channel_line.split()[0] - tokens = [first_token] + tokens - - return tokens # Prepend 'Time' column if applicable + self.data = self.load_data() + def load_data(self): + for idxs, group in enumerate(self.file_index): + for idx, tuple in enumerate(group): + file_path = os.path.join(self.base_path, tuple[0]) # ('zzzAD1.TXT') + if self.include_time: + col_indices = [0] + tuple[1] # [1, 26] + [0] -> [0, 1, 26] + else: + col_indices = tuple[1] # [1, 26] + try: + # Read the CSV file + df = pd.read_csv(file_path, delim_whitespace=True, skiprows=10, header=0, memory_map=True) + self.file_index[idxs][idx] = df.iloc[:, col_indices].copy() # Extract the specified columns + + print(f"Processed {file_path}, extracted columns: {col_indices}") + + except Exception as e: + print(f"Error processing {file_path}: {str(e)}") def _load_dataframe(self, file_path: str) -> OriginalSingleDamageScenario: """ Loads a single data file into a pandas DataFrame. @@ -100,11 +136,7 @@ class DataProcessor: :param file_path: Path to the data file. :return: DataFrame containing the numerical data. 
""" - col_names = self._extract_column_names(file_path) - df = pd.read_csv( - file_path, delim_whitespace=True, skiprows=11, header=None, memory_map=True - ) - df.columns = col_names + df = pd.read_csv(file_path, delim_whitespace=True, skiprows=10, header=0, memory_map=True, nrows=1) return df def _load_all_data(self) -> GroupDataset: @@ -115,7 +147,11 @@ class DataProcessor: """ data = [] # Find the maximum group index to determine the list size - max_group_idx = max(self.file_index.keys()) if self.file_index else 0 + max_group_idx = len(self.file_index) if self.file_index else 0 + + # Handle case when file_index is empty + if max_group_idx == 0: + raise ValueError("No file index provided; file_index is empty.") # Initialize empty lists for _ in range(max_group_idx): @@ -123,10 +159,8 @@ class DataProcessor: # Fill the list with data for group_idx, file_list in self.file_index.items(): - # Adjust index to be 0-based - list_idx = group_idx - 1 - data[list_idx] = [self._load_dataframe(file) for file in file_list] - + group_idx -= 1 # adjust due to undamage file + data[group_idx] = [self._load_dataframe(file) for file in file_list] return data def get_group_data(self, group_idx: int) -> List[pd.DataFrame]: @@ -182,14 +216,14 @@ class DataProcessor: y = 0 for data_group in self.data: # len(data_group[i]) = 5 for j in data_group: # len(j[i]) = - c: VectorColumnIndex = [] # column vector c_{j} + c: VectorColumnIndex = [] x = 0 for _ in range(6): # TODO: range(6) should be dynamic and parameterized c.append(x + y) x += 5 vector_col_idx.append(c) y += 1 - return vector_col_idx + return vector_col_idx # TODO: refactor this so that it returns just from first data_group without using for loops through the self.data that seems unnecessary def create_vector_column(self, overwrite=True) -> List[List[List[pd.DataFrame]]]: """ @@ -197,26 +231,16 @@ class DataProcessor: :param overwrite: Overwrite the original data with vector column-based data. 
""" - idx = self._create_vector_column_index() - # if overwrite: - for i in range(len(self.data)): - for j in range(len(self.data[i])): - # Get the appropriate indices for slicing from idx - indices = idx[j] + idxs = self._create_vector_column_index() + for i, group in enumerate(self.data): + # add 1 to all indices to account for 'Time' being at position 0 + for j, df in enumerate(group): + idx = [_ + 1 for _ in idxs[j]] + # slice out the desired columns, copy into a fresh DataFrame, + # then overwrite self.data[i][j] with it + self.data[i][j] = df.iloc[:, idx].copy() - # Get the current DataFrame - df = self.data[i][j] - - # Keep the 'Time' column and select only specified 'Real' columns - # First, we add 1 to all indices to account for 'Time' being at position 0 - real_indices = [index + 1 for index in indices] - - # Create list with Time column index (0) and the adjusted Real indices - all_indices = [0] + real_indices - - # Apply the slicing - self.data[i][j] = df.iloc[:, all_indices] - # TODO: if !overwrite: + # TODO: if !overwrite: def create_limited_sensor_vector_column(self, overwrite=True): """ @@ -252,91 +276,79 @@ class DataProcessor: :param output_dir: Directory to save the CSV files. :param file_prefix: Prefix for the output filenames. 
""" - for group_idx, group in enumerate(self.data, start=1): + for group_idx, group in enumerate(self.file_index, start=0): group_folder = os.path.join(output_dir, f"{file_prefix}_{group_idx}") os.makedirs(group_folder, exist_ok=True) + for test_idx, df in enumerate(group, start=1): - # Ensure columns are named uniquely if duplicated - df = df.copy() - df.columns = ["Time", "Real_0", "Real_1"] # Rename + out1 = os.path.join(group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_01.csv") + cols_to_export = [0, 1] if self.include_time else [1] + df.iloc[:, cols_to_export].to_csv(out1, index=False) - # Export first Real column - out1 = os.path.join( - group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_01.csv" - ) - df[["Time", "Real_0"]].rename(columns={"Real_0": "Real"}).to_csv( - out1, index=False - ) + out2 = os.path.join(group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_02.csv") + cols_to_export = [0, 2] if self.include_time else [2] + df.iloc[:, cols_to_export].to_csv(out2, index=False) - # Export last Real column - out2 = os.path.join( - group_folder, f"{file_prefix}_{group_idx}_TEST{test_idx}_02.csv" - ) - df[["Time", "Real_1"]].rename(columns={"Real_1": "Real"}).to_csv( - out2, index=False - ) +# def create_damage_files(base_path, output_base, prefix): +# # Initialize colorama +# init(autoreset=True) +# # Generate column labels based on expected duplication in input files +# columns = ["Real"] + [ +# f"Real.{i}" for i in range(1, 30) +# ] # Explicitly setting column names -def create_damage_files(base_path, output_base, prefix): - # Initialize colorama - init(autoreset=True) +# sensor_end_map = { +# 1: "Real.25", +# 2: "Real.26", +# 3: "Real.27", +# 4: "Real.28", +# 5: "Real.29", +# } - # Generate column labels based on expected duplication in input files - columns = ["Real"] + [ - f"Real.{i}" for i in range(1, 30) - ] # Explicitly setting column names +# # Define the damage scenarios and the corresponding original file indices +# 
damage_scenarios = { +# 1: range(1, 6), # Damage 1 files from zzzAD1.csv to zzzAD5.csv +# 2: range(6, 11), # Damage 2 files from zzzAD6.csv to zzzAD10.csv +# 3: range(11, 16), # Damage 3 files from zzzAD11.csv to zzzAD15.csvs +# 4: range(16, 21), # Damage 4 files from zzzAD16.csv to zzzAD20.csv +# 5: range(21, 26), # Damage 5 files from zzzAD21.csv to zzzAD25.csv +# 6: range(26, 31), # Damage 6 files from zzzAD26.csv to zzzAD30.csv +# } +# damage_pad = len(str(len(damage_scenarios))) +# test_pad = len(str(30)) - sensor_end_map = { - 1: "Real.25", - 2: "Real.26", - 3: "Real.27", - 4: "Real.28", - 5: "Real.29", - } +# for damage, files in damage_scenarios.items(): +# for i, file_index in enumerate(files, start=1): +# # Load original data file +# file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT") +# df = pd.read_csv( +# file_path, sep="\t", skiprows=10 +# ) # Read with explicit column names - # Define the damage scenarios and the corresponding original file indices - damage_scenarios = { - 1: range(1, 6), # Damage 1 files from zzzAD1.csv to zzzAD5.csv - 2: range(6, 11), # Damage 2 files from zzzAD6.csv to zzzAD10.csv - 3: range(11, 16), # Damage 3 files from zzzAD11.csv to zzzAD15.csvs - 4: range(16, 21), # Damage 4 files from zzzAD16.csv to zzzAD20.csv - 5: range(21, 26), # Damage 5 files from zzzAD21.csv to zzzAD25.csv - 6: range(26, 31), # Damage 6 files from zzzAD26.csv to zzzAD30.csv - } - damage_pad = len(str(len(damage_scenarios))) - test_pad = len(str(30)) +# top_sensor = columns[i - 1] +# print(top_sensor, type(top_sensor)) +# output_file_1 = os.path.join( +# output_base, f"DAMAGE_{damage}", f"DAMAGE{damage}_TEST{i}_01.csv" +# ) +# print(f"Creating {output_file_1} from taking zzz{prefix}D{file_index}.TXT") +# print("Taking datetime column on index 0...") +# print(f"Taking `{top_sensor}`...") +# os.makedirs(os.path.dirname(output_file_1), exist_ok=True) +# df[["Time", top_sensor]].to_csv(output_file_1, index=False) +# print(Fore.GREEN + 
"Done") - for damage, files in damage_scenarios.items(): - for i, file_index in enumerate(files, start=1): - # Load original data file - file_path = os.path.join(base_path, f"zzz{prefix}D{file_index}.TXT") - df = pd.read_csv( - file_path, sep="\t", skiprows=10 - ) # Read with explicit column names - - top_sensor = columns[i - 1] - print(top_sensor, type(top_sensor)) - output_file_1 = os.path.join( - output_base, f"DAMAGE_{damage}", f"DAMAGE{damage}_TEST{i}_01.csv" - ) - print(f"Creating {output_file_1} from taking zzz{prefix}D{file_index}.TXT") - print("Taking datetime column on index 0...") - print(f"Taking `{top_sensor}`...") - os.makedirs(os.path.dirname(output_file_1), exist_ok=True) - df[["Time", top_sensor]].to_csv(output_file_1, index=False) - print(Fore.GREEN + "Done") - - bottom_sensor = sensor_end_map[i] - output_file_2 = os.path.join( - output_base, f"DAMAGE_{damage}", f"DAMAGE{damage}_TEST{i}_02.csv" - ) - print(f"Creating {output_file_2} from taking zzz{prefix}D{file_index}.TXT") - print("Taking datetime column on index 0...") - print(f"Taking `{bottom_sensor}`...") - os.makedirs(os.path.dirname(output_file_2), exist_ok=True) - df[["Time", bottom_sensor]].to_csv(output_file_2, index=False) - print(Fore.GREEN + "Done") - print("---") +# bottom_sensor = sensor_end_map[i] +# output_file_2 = os.path.join( +# output_base, f"DAMAGE_{damage}", f"DAMAGE{damage}_TEST{i}_02.csv" +# ) +# print(f"Creating {output_file_2} from taking zzz{prefix}D{file_index}.TXT") +# print("Taking datetime column on index 0...") +# print(f"Taking `{bottom_sensor}`...") +# os.makedirs(os.path.dirname(output_file_2), exist_ok=True) +# df[["Time", bottom_sensor]].to_csv(output_file_2, index=False) +# print(Fore.GREEN + "Done") +# print("---") def main(): diff --git a/data/QUGS/test.py b/data/QUGS/test.py index 12b39cf..058af7c 100644 --- a/data/QUGS/test.py +++ b/data/QUGS/test.py @@ -1,25 +1,45 @@ from convert import * from joblib import dump, load +# b = generate_damage_files_index( 
+# num_damage=6, +# file_index_start=1, +# col=5, +# base_path="D:/thesis/data/dataset_B", +# prefix="zzzBD", +# # undamage_file="zzzBU.TXT" +# ) +# Example: Generate tuples with a special group of df0 at the beginning +special_groups_A = [ + {'df_name': 'zzzAU.TXT', 'position': 0, 'size': 5} # Add at beginning +] + +special_groups_B = [ + {'df_name': 'zzzBU.TXT', 'position': 0, 'size': 5} # Add at beginning +] + +# Generate the tuples with the special group +# a = generate_df_tuples(special_groups=special_groups_A) +b = generate_df_tuples(special_groups=special_groups_B, prefix="zzzBD") + + # a = generate_damage_files_index( -# num_damage=6, file_index_start=1, col=5, base_path="D:/thesis/data/dataset_A" +# num_damage=6, +# file_index_start=1, +# col=5, +# base_path="D:/thesis/data/dataset_A", +# prefix="zzzAD", +# # undamage_file="zzzBU.TXT" # ) -b = generate_damage_files_index( - num_damage=6, - file_index_start=1, - col=5, - base_path="D:/thesis/data/dataset_B", - prefix="zzzBD", -) -# data_A = DataProcessor(file_index=a) -# # data.create_vector_column(overwrite=True) -# data_A.create_limited_sensor_vector_column(overwrite=True) +# data_A = DataProcessor(file_index=a, base_path="D:/thesis/data/dataset_A", include_time=True) +# data_A.create_vector_column(overwrite=True) +# # data_A.create_limited_sensor_vector_column(overwrite=True) # data_A.export_to_csv("D:/thesis/data/converted/raw") -data_B = DataProcessor(file_index=b) -# data.create_vector_column(overwrite=True) -data_B.create_limited_sensor_vector_column(overwrite=True) +data_B = DataProcessor(file_index=b, base_path="D:/thesis/data/dataset_B", include_time=True) +# data_B.create_vector_column(overwrite=True) +# # data_B.create_limited_sensor_vector_column(overwrite=True) data_B.export_to_csv("D:/thesis/data/converted/raw_B") # a = load("D:/cache.joblib") # breakpoint() diff --git a/latex/figures/A4 - 4.png b/latex/figures/A4 - 4.png new file mode 100644 index 0000000..45efba2 Binary files /dev/null and 
b/latex/figures/A4 - 4.png differ