Files
thesis/code/notebooks/03_feature_extraction.ipynb
nuluh 96556a1186 "No code changes detected." 2025-03-16 14:07:56 +07:00

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mockup Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook tests the `FeatureExtractor` class from the `time_domain_features.py` script on a simple mock dataset of 5 to 10 data points."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Importing Modules\n",
"\n",
"Use relative imports or modify the path to include the directory where the module is stored. In this example, we'll simulate the relative import setup."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Mockup Data\n",
"\n",
"Create a small dataset of 10 random data points and wrap it in a DataFrame to simulate processed input data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create mockup data\n",
"np.random.seed(42) # For reproducibility\n",
"mock_data = np.random.randn(10) # Generate 10 random data points\n",
"\n",
"# Convert to DataFrame (simulating processed data input)\n",
"mock_df = pd.DataFrame(mock_data, columns=['SampleData'])\n",
"mock_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualize Data Points"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Plotting the data points\n",
"plt.figure(figsize=(8, 6))\n",
"plt.plot(mock_df.index, mock_df['SampleData'], marker='o', color='blue', label='Data Points')\n",
"plt.title('Mock Data Points')\n",
"plt.xlabel('Sample Index')\n",
"plt.ylabel('SampleData')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print Time-domain Features (Single Mockup Data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import sys\n",
"import os\n",
"# Assuming the src directory is one level up from the notebooks directory\n",
"sys.path.append('../src/features')\n",
"from time_domain_features import FeatureExtractor\n",
"\n",
"\n",
"# Extract features\n",
"extracted = FeatureExtractor(mock_df['SampleData'])\n",
"\n",
"# Format as a pandas DataFrame\n",
"features = pd.DataFrame(extracted.features, index=[0])\n",
"features\n"
]
},
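{
"cell_type": "markdown",
"metadata": {},
"source": [
"The exact feature set is defined in `time_domain_features.py`; as a hypothetical sketch (not the actual `FeatureExtractor` output), common time-domain features such as mean, RMS, peak, and crest factor can be computed from a mock signal with NumPy alone:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Hypothetical sketch of common time-domain features; may differ from FeatureExtractor\n",
"x = np.random.default_rng(42).standard_normal(10)\n",
"rms = np.sqrt(np.mean(x**2))  # Root mean square\n",
"peak = np.max(np.abs(x))      # Peak amplitude\n",
"sketch = {'Mean': np.mean(x), 'RMS': rms, 'Peak': peak, 'Crest': peak / rms}\n",
"sketch"
]
},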
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print Time-domain Features (Multiple CSV Mockup Data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Importing modules"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import sys\n",
"import os\n",
"import re\n",
"# Assuming the src directory is one level up from the notebooks directory\n",
"sys.path.append('../src/features')\n",
"from time_domain_features import ExtractTimeFeatures # use wrapper function instead of class for easy use\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### The function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract all numbers from a filename; the first one is later used as the label\n",
"def extract_numbers(filename):\n",
" # Find all occurrences of one or more digits in the filename\n",
" numbers = re.findall(r'\\d+', filename)\n",
" # Convert the list of number strings to integers\n",
" numbers = [int(num) for num in numbers]\n",
" # Return the list of integers\n",
" return numbers\n",
"\n",
"def build_features(input_dir: str, sensor: int = None):\n",
" all_features = []\n",
" for nth_damage in os.listdir(input_dir):\n",
" nth_damage_path = os.path.join(input_dir, nth_damage)\n",
" if os.path.isdir(nth_damage_path):\n",
" for nth_test in os.listdir(nth_damage_path):\n",
" nth_test_path = os.path.join(nth_damage_path, nth_test)\n",
" if sensor is not None:\n",
" # Check if the file has the specified sensor suffix\n",
" if not nth_test.endswith(f'_{sensor}.csv'):\n",
" continue\n",
"features = ExtractTimeFeatures(nth_test_path) # returns the features of one CSV file as a dictionary\n",
"features['label'] = extract_numbers(nth_test)[0] # add the label to the dictionary\n",
"features['filename'] = nth_test # add the filename to the dictionary\n",
" all_features.append(features)\n",
"\n",
" # Create a DataFrame from the list of dictionaries\n",
" df = pd.DataFrame(all_features)\n",
" return df"
]
},
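{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (the filename below is hypothetical; adjust it to the real naming scheme), the digit-extraction regex pulls every run of digits in order of appearance:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Hypothetical filename -- verify against the actual naming convention\n",
"example = 'damage3_test12_1.csv'\n",
"[int(n) for n in re.findall(r'\\d+', example)]  # -> [3, 12, 1]"
]
},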
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Execute the automation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_dir = \"../../data/raw\"\n",
"# Extract features\n",
"df1 = build_features(data_dir, sensor=1)\n",
"df2 = build_features(data_dir, sensor=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Sensor 1 Extracted Features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1.head(5)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Sensor 2 Extracted Features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df2.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Compute relative features (sensor 2 / sensor 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Separate the label column (second to last; the last column is the filename)\n",
"label_column = df1.iloc[:, -2]\n",
"\n",
"# Compute relative features as the element-wise ratio of sensor 2 to sensor 1\n",
"df_relative = df2.iloc[:, :-2] / df1.iloc[:, :-2]\n",
"\n",
"# Add the label column back to the resulting DataFrame\n",
"df_relative['label'] = label_column\n",
"\n",
"# Append a string to all column names\n",
"suffix = '_rel'\n",
"df_relative.columns = [col + suffix if col != 'label' else col for col in df_relative.columns]\n",
"\n",
"# Inspect the resulting DataFrame\n",
"df_relative.info()\n",
"df_relative.head(5)"
]
},
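{
"cell_type": "markdown",
"metadata": {},
"source": [
"On a tiny made-up example (hypothetical values, two feature columns), the element-wise division lines up rows and columns by position:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Hypothetical two-row example of the sensor2 / sensor1 ratio\n",
"s1 = pd.DataFrame({'Mean': [2.0, 4.0], 'Max': [10.0, 20.0]})\n",
"s2 = pd.DataFrame({'Mean': [1.0, 2.0], 'Max': [5.0, 5.0]})\n",
"s2 / s1  # -> Mean: [0.5, 0.5], Max: [0.5, 0.25]"
]
},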
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Subsetting the DataFrame for pair plots (too many features to plot them all)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Subset the DataFrame to three representative features plus the label\n",
"subset_df = df_relative[['Mean_rel', 'Max_rel', 'Peak (Pm)_rel', 'label']]\n",
"\n",
"# Plotting the pairplot\n",
"g = sns.pairplot(subset_df, hue='label', diag_kind='kde')\n",
"\n",
"# Adjusting the axis limits\n",
"for ax in g.axes.flatten():\n",
" ax.set_xlim(-10, 10) # Adjust these limits based on your data\n",
" ax.set_ylim(-10, 10) # Adjust these limits based on your data\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Standard Scaler"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Work on a copy so subset_df itself is not modified (avoids SettingWithCopyWarning)\n",
"std_df = subset_df.copy()\n",
"features = std_df.columns[:-1] # Select all columns except the label\n",
"scaler = StandardScaler()\n",
"std_df[features] = scaler.fit_transform(std_df[features])\n",
"\n",
"# Plotting the pairplot\n",
"sns.pairplot(std_df, hue='label', diag_kind='kde')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Min-Max Scaler"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Work on a copy so subset_df itself is not modified\n",
"mm_df = subset_df.copy()\n",
"features = mm_df.columns[:-1] # Select all columns except the label\n",
"scaler = MinMaxScaler()\n",
"mm_df[features] = scaler.fit_transform(mm_df[features])\n",
"\n",
"# Plotting the pairplot\n",
"sns.pairplot(mm_df, hue='label', diag_kind='kde')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}