(4) Apply augmentation on train set only

36920ef7 · Almouhannad Hafez · b355fbd8 · 36920ef7 · 36920ef7 · 36920ef7
Commit 36920ef7 authored Nov 22, 2024 by Almouhannad Hafez
3 changed files
--- a/4/4.Data_augmentation.ipynb
+++ b/4/4.Data_augmentation.ipynb
@@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -44,7 +44,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -68,7 +68,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -102,7 +102,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@@ -140,7 +140,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -180,14 +180,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Done 10%\n",
+      " Done 10%\n",
      "Done 20%\n",
      "Done 30%\n",
      "Done 40%\n",
@@ -206,7 +206,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -223,112 +223,59 @@
    "# ***Preprocess augmented dataset***"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.utils import shuffle\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "import contractions\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## ***1- Refactor dataset columns so we have text and label only***\n"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
-     "data": {
+     "name": "stdout",
-      "text/html": [
+     "output_type": "stream",
-       "<div>\n",
+     "text": [
-       "<style scoped>\n",
+      "        Id      label                                               text\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
+      "0        0  Psoriasis  I have been experiencing a skin rash on my arm...\n",
-       "        vertical-align: middle;\n",
+      "1        1  Psoriasis  My skin has been peeling, especially on my kne...\n",
-       "    }\n",
+      "2        2  Psoriasis  I have been experiencing joint pain in my fing...\n",
+      "3        3  Psoriasis  There is a silver like dusting on my skin, esp...\n",
+      "4        4  Psoriasis  My nails have small dents or pits in them, and...\n",
+      "...    ...        ...                                                ...\n",
+      "1195  1195   diabetes  I'm shaking and trembling all over. I've lost ...\n",
+      "1196  1196   diabetes  Particularly in the crevices of my skin, I hav...\n",
+      "1197  1197   diabetes  I regularly experience these intense urges and...\n",
+      "1198  1198   diabetes  I have trouble breathing, especially outside. ...\n",
+      "1199  1199   diabetes  I constantly sneeze and have a dry cough. My i...\n",
      "\n",
-       "    .dataframe tbody tr th {\n",
+      "[1200 rows x 3 columns]\n",
-       "        vertical-align: top;\n",
+      "      Unnamed: 0      label                                               text\n",
-       "    }\n",
+      "0              0  Psoriasis  I have been experiencing a skin rash on my arm...\n",
+      "1              1  Psoriasis  My skin has been peeling, especially on my kne...\n",
+      "2              2  Psoriasis  I have been experiencing joint pain in my fing...\n",
+      "3              3  Psoriasis  There is a silver like dusting on my skin, esp...\n",
+      "4              4  Psoriasis  My nails have small dents or pits in them, and...\n",
+      "...          ...        ...                                                ...\n",
+      "1195         295   diabetes  I'm shaking and trembling all over. I've lost ...\n",
+      "1196         296   diabetes  Particularly in the crevices of my skin, I hav...\n",
+      "1197         297   diabetes  I regularly experience these intense urges and...\n",
+      "1198         298   diabetes  I have trouble breathing, especially outside. ...\n",
+      "1199         299   diabetes  I constantly sneeze and have a dry cough. My i...\n",
      "\n",
-       "    .dataframe thead th {\n",
+      "[1200 rows x 3 columns]\n"
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>label</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>I have been experiencing a skin rash on my arm...</td>\n",
-       "      <td>Psoriasis</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>My skin has been peeling, especially on my kne...</td>\n",
-       "      <td>Psoriasis</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>I have been experiencing joint pain in my fing...</td>\n",
-       "      <td>Psoriasis</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>There is a silver like dusting on my skin, esp...</td>\n",
-       "      <td>Psoriasis</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>My nails have small dents or pits in them, and...</td>\n",
-       "      <td>Psoriasis</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text      label\n",
-       "0  I have been experiencing a skin rash on my arm...  Psoriasis\n",
-       "1  My skin has been peeling, especially on my kne...  Psoriasis\n",
-       "2  I have been experiencing joint pain in my fing...  Psoriasis\n",
-       "3  There is a silver like dusting on my skin, esp...  Psoriasis\n",
-       "4  My nails have small dents or pits in them, and...  Psoriasis"
     ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
    }
   ],
   "source": [
-    "df = augmented_df\n",
+    "from sklearn.utils import shuffle\n",
-    "\n",
+    "from sklearn.model_selection import train_test_split\n",
-    "df = df.drop(columns=['Id'])\n",
+    "original_data = augmented_df.head(1200)\n",
-    "df = df[['text', 'label']]\n",
+    "print(original_data)\n",
-    "\n",
+    "print(df)"
-    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## ***2- Check for duplicates***"
+    "## ***1- Split ORIGINAL DATASET into train and test sets***\n"
   ]
  },
  {
@@ -340,59 +287,37 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Number of duplicates found: 59\n",
+      "Train set shape: (720, 3)\n",
-      "Shape of after removing duplicates: (7141, 2)\n"
+      "Test set shaape: (480, 3)\n"
     ]
    }
   ],
   "source": [
-    "duplicates = df[df.duplicated()]\n",
+    "train_df, test_df = train_test_split(original_data, test_size=0.4, random_state=42)\n",
-    "print(f\"Number of duplicates found: {len(duplicates)}\")\n",
+    "print(f\"Train set shape: {train_df.shape}\")\n",
-    "\n",
+    "print(f\"Test set shaape: {test_df.shape}\")"
-    "# Remove duplicates\n",
-    "df = df.drop_duplicates()\n",
-    "print(f\"Shape of after removing duplicates: {df.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## ***3- Check for nulls***\n"
+    "## ***2- Add augmented texts to TRAIN SET only***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Count of null values in each column:\n",
-      "text     0\n",
-      "label    0\n",
-      "dtype: int64\n",
-      "Shape of the DataFrame after removing null values: (7141, 2)\n"
-     ]
-    }
-   ],
-   "source": [
-    "null_values = df.isnull().sum()\n",
-    "print(\"Count of null values in each column:\")\n",
-    "print(null_values)\n",
-    "\n",
-    "# Remove rows with null values\n",
-    "df = df.dropna()\n",
-    "print(f\"Shape of the DataFrame after removing null values: {df.shape}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
   "source": [
-    "## ***4- Shuffle dataset***\n"
+    "def get_augmented_texts_ids(original_id):\n",
+    "    start = 1200\n",
+    "    shift = 5 * original_id\n",
+    "    augmented_texts_ids = []\n",
+    "    for i in range(5):\n",
+    "        augmented_texts_ids.append(start + shift + i)\n",
+    "    return augmented_texts_ids"
   ]
  },
  {
@@ -401,85 +326,33 @@
   "metadata": {},
   "outputs": [
    {
-     "data": {
+     "name": "stdout",
-      "text/html": [
+     "output_type": "stream",
-       "<div>\n",
+     "text": [
-       "<style scoped>\n",
+      "New train_set shape: (4320, 3)\n"
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>label</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>I'm experiencing persistent symptoms of a cold...</td>\n",
-       "      <td>Common Cold</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>I'm experiencing intense symptoms including se...</td>\n",
-       "      <td>Jaundice</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>I'm experiencing a strong, unpleasant taste in...</td>\n",
-       "      <td>peptic ulcer disease</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>I have been experiencing digestive issues, inc...</td>\n",
-       "      <td>Migraine</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>I've noticed that the veins on my calves have ...</td>\n",
-       "      <td>Varicose Veins</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text                 label\n",
-       "0  I'm experiencing persistent symptoms of a cold...           Common Cold\n",
-       "1  I'm experiencing intense symptoms including se...              Jaundice\n",
-       "2  I'm experiencing a strong, unpleasant taste in...  peptic ulcer disease\n",
-       "3  I have been experiencing digestive issues, inc...              Migraine\n",
-       "4  I've noticed that the veins on my calves have ...        Varicose Veins"
     ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
    }
   ],
   "source": [
-    "df = shuffle(df, random_state=42).reset_index(drop=True)\n",
+    "original_ids = train_df[\"Id\"]\n",
+    "dataframes_to_concat = []\n",
+    "\n",
+    "for id in original_ids:\n",
+    "    for augmented_id in get_augmented_texts_ids(id):\n",
+    "        augmented_rows = augmented_df[augmented_df[\"Id\"] == augmented_id]\n",
+    "        dataframes_to_concat.append(augmented_rows)\n",
    "\n",
-    "df.head()"
+    "augmented_data = pd.concat(dataframes_to_concat, ignore_index=True)\n",
+    "train_df = pd.concat([train_df, augmented_data], ignore_index=True)\n",
+    "\n",
+    "print(f\"New train_set shape: {train_df.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## ***5- Count unique texts for each label***\n"
+    "## ***3- Count unique texts for each label***"
   ]
  },
  {
@@ -491,248 +364,103 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Count of unique text values for each label\n",
+      "Count of unique text values for each label in train set\n",
      "label\n",
-      "Jaundice                           288\n",
+      "Fungal infection                   144\n",
-      "Dimorphic Hemorrhoids              291\n",
+      "Varicose Veins                     150\n",
-      "Malaria                            292\n",
+      "Migraine                           155\n",
-      "Migraine                           296\n",
+      "urinary tract infection            162\n",
-      "Acne                               296\n",
+      "Impetigo                           162\n",
-      "Arthritis                          296\n",
+      "Jaundice                           164\n",
-      "Pneumonia                          297\n",
+      "Dengue                             166\n",
-      "gastroesophageal reflux disease    297\n",
+      "Pneumonia                          168\n",
-      "Dengue                             298\n",
+      "Acne                               172\n",
-      "Chicken pox                        298\n",
+      "Cervical spondylosis               173\n",
-      "drug reaction                      299\n",
+      "Common Cold                        174\n",
-      "diabetes                           299\n",
+      "gastroesophageal reflux disease    174\n",
-      "Typhoid                            299\n",
+      "Arthritis                          179\n",
-      "Common Cold                        299\n",
+      "Dimorphic Hemorrhoids              184\n",
-      "Impetigo                           299\n",
+      "Bronchial Asthma                   185\n",
-      "Hypertension                       299\n",
+      "Hypertension                       191\n",
-      "Cervical spondylosis               299\n",
+      "peptic ulcer disease               192\n",
-      "Bronchial Asthma                   299\n",
+      "Typhoid                            192\n",
-      "peptic ulcer disease               300\n",
+      "Malaria                            194\n",
-      "Psoriasis                          300\n",
+      "diabetes                           197\n",
-      "Varicose Veins                     300\n",
+      "Psoriasis                          198\n",
-      "allergy                            300\n",
+      "drug reaction                      204\n",
-      "Fungal infection                   300\n",
+      "Chicken pox                        210\n",
-      "urinary tract infection            300\n",
+      "allergy                            210\n",
      "Name: text, dtype: int64\n"
     ]
    }
   ],
   "source": [
-    "unique_text_counts = df.groupby('label')['text'].nunique()\n",
+    "# Train set\n",
+    "unique_text_counts = train_df.groupby('label')['text'].nunique()\n",
    "unique_text_counts = unique_text_counts.sort_values()\n",
    "\n",
-    "print(\"Count of unique text values for each label\")\n",
+    "print(\"Count of unique text values for each label in train set\")\n",
    "print(unique_text_counts)"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## ***6- Convert into lowercase***\n"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
-     "data": {
+     "name": "stdout",
-      "text/html": [
+     "output_type": "stream",
-       "<div>\n",
+     "text": [
-       "<style scoped>\n",
+      "Count of unique text values for each label in test set\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
+      "label\n",
-       "        vertical-align: middle;\n",
+      "Chicken pox                        15\n",
-       "    }\n",
+      "allergy                            15\n",
-       "\n",
+      "drug reaction                      16\n",
-       "    .dataframe tbody tr th {\n",
+      "diabetes                           17\n",
-       "        vertical-align: top;\n",
+      "Dimorphic Hemorrhoids              17\n",
-       "    }\n",
+      "Malaria                            17\n",
-       "\n",
+      "Psoriasis                          17\n",
-       "    .dataframe thead th {\n",
+      "Hypertension                       18\n",
-       "        text-align: right;\n",
+      "Typhoid                            18\n",
-       "    }\n",
+      "peptic ulcer disease               18\n",
-       "</style>\n",
+      "Arthritis                          19\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
+      "Bronchial Asthma                   19\n",
-       "  <thead>\n",
+      "Jaundice                           20\n",
-       "    <tr style=\"text-align: right;\">\n",
+      "gastroesophageal reflux disease    21\n",
-       "      <th></th>\n",
+      "Acne                               21\n",
-       "      <th>text</th>\n",
+      "Common Cold                        21\n",
-       "      <th>label</th>\n",
+      "Cervical spondylosis               21\n",
-       "    </tr>\n",
+      "Pneumonia                          21\n",
-       "  </thead>\n",
+      "Dengue                             22\n",
-       "  <tbody>\n",
+      "Migraine                           23\n",
-       "    <tr>\n",
+      "Impetigo                           23\n",
-       "      <th>0</th>\n",
+      "urinary tract infection            23\n",
-       "      <td>i'm experiencing persistent symptoms of a cold...</td>\n",
+      "Varicose Veins                     25\n",
-       "      <td>common cold</td>\n",
+      "Fungal infection                   26\n",
-       "    </tr>\n",
+      "Name: text, dtype: int64\n"
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>i'm experiencing intense symptoms including se...</td>\n",
-       "      <td>jaundice</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>i'm experiencing a strong, unpleasant taste in...</td>\n",
-       "      <td>peptic ulcer disease</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>i have been experiencing digestive issues, inc...</td>\n",
-       "      <td>migraine</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>i've noticed that the veins on my calves have ...</td>\n",
-       "      <td>varicose veins</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text                 label\n",
-       "0  i'm experiencing persistent symptoms of a cold...           common cold\n",
-       "1  i'm experiencing intense symptoms including se...              jaundice\n",
-       "2  i'm experiencing a strong, unpleasant taste in...  peptic ulcer disease\n",
-       "3  i have been experiencing digestive issues, inc...              migraine\n",
-       "4  i've noticed that the veins on my calves have ...        varicose veins"
     ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
    }
   ],
   "source": [
-    "df['text'] = df['text'].str.lower()\n",
+    "# Test set\n",
-    "df['label'] = df['label'].str.lower()\n",
+    "unique_text_counts = test_df.groupby('label')['text'].nunique()\n",
-    "df.head()"
+    "unique_text_counts = unique_text_counts.sort_values()\n",
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## ***7- Expand contractions***\n",
-    "**i.e. I'm => I am**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
    "\n",
-       "    .dataframe thead th {\n",
+    "print(\"Count of unique text values for each label in test set\")\n",
-       "        text-align: right;\n",
+    "print(unique_text_counts)"
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>label</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>i am experiencing persistent symptoms of a col...</td>\n",
-       "      <td>common cold</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>i am experiencing intense symptoms including s...</td>\n",
-       "      <td>jaundice</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>i am experiencing a strong, unpleasant taste i...</td>\n",
-       "      <td>peptic ulcer disease</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>i have been experiencing digestive issues, inc...</td>\n",
-       "      <td>migraine</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>i have noticed that the veins on my calves hav...</td>\n",
-       "      <td>varicose veins</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text                 label\n",
-       "0  i am experiencing persistent symptoms of a col...           common cold\n",
-       "1  i am experiencing intense symptoms including s...              jaundice\n",
-       "2  i am experiencing a strong, unpleasant taste i...  peptic ulcer disease\n",
-       "3  i have been experiencing digestive issues, inc...              migraine\n",
-       "4  i have noticed that the veins on my calves hav...        varicose veins"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['text'] = df['text'].apply(contractions.fix)\n",
-    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## ***8- Split into Train-Test and save in .csv files***\n"
+    "## ***4- Save in .csv files***"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train set shape: (4284, 2)\n",
-      "Test set shaape: (2857, 2)\n"
-     ]
-    }
-   ],
-   "source": [
-    "train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)\n",
-    "print(f\"Train set shape: {train_df.shape}\")\n",
-    "print(f\"Test set shaape: {test_df.shape}\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [

--- a/data/augmented_Preprocessed_Symptom2Disease_Test.csv
+++ b/data/augmented_Preprocessed_Symptom2Disease_Test.csv
--- a/data/augmented_Preprocessed_Symptom2Disease_Train.csv
+++ b/data/augmented_Preprocessed_Symptom2Disease_Train.csv