Dataset Augmentation

a3277a8d · Almouhannad Hafez · af95466f · a3277a8d · a3277a8d · a3277a8d
Commit a3277a8d authored Nov 15, 2024 by Almouhannad Hafez
5 changed files
--- a/4.data_augmentation.ipynb
+++ b/4.data_augmentation.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ***Setup***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "from ollama import Client\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "from constants import CONSTANTS"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***Read dataset***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(CONSTANTS.DATASET_PATH)\n",
+    "assert df.shape == (1200, 3), f\"Expected shape (1200, 3), but got {df.shape}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***Some helper functions***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***Send prompts to `llama3` API***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Client(host='http://172.25.1.141:11434')\n",
+    "\n",
+    "def clean_generated_text(text):\n",
+    "    cleaned_text = re.sub(r'[^\\w\\s,.\\']', '', text)  # Remove special characters\n",
+    "    cleaned_text = cleaned_text.replace('\\n', ' ')  # replace newlines with spaces\n",
+    "    cleaned_text = ' '.join(cleaned_text.split())  # Remove extra spaces\n",
+    "    return cleaned_text.strip()  # Trim leading and trailing spaces\n",
+    "\n",
+    "\n",
+    "def send_prompt(prompt):\n",
+    "    response = client.chat(model='llama3', messages=[\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': prompt + '\\nSend the response only.',\n",
+    "        },\n",
+    "    ])\n",
+    "    \n",
+    "    # Extracting the content from the response\n",
+    "    return clean_generated_text(response['message']['content'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### ***Example usage***\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I've noticed a persistent skin rash on my arms, legs, and torso over the past few weeks. The affected areas appear red, itchy, and are characterized by dry, scaling patches.\n",
+      "For the past few weeks, I've been dealing with a persistent skin rash that affects my arms, legs, and torso. The rash appears as red, itchy areas with dry, flaky patches.\n",
+      "For several weeks, I've been dealing with a persistent skin issue affecting my arms, legs, and torso. The rash appears as red, itchy areas covered in dry, flaky scales.\n",
+      "For several weeks, I've noticed a persistent skin issue affecting my arms, legs, and torso. The affected areas appear red, irritated, and feature dry, flaky scales.\n",
+      "For the past few weeks, I've been dealing with an uncomfortable skin issue that's affected my arms, legs, and torso. The rash has characterized by a reddish hue, intense itchiness, and the presence of dry, flaky scales.\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = \"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.\"\n",
+    "for i in range(5):\n",
+    "  result_i = send_prompt(f'Rephrase the following text:\\n{text}')\n",
+    "  print(result_i)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***Augment dataset using `llm`***\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**`n_augmentations` is the number of rephrased texts to generate for each row in the original dataset**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def augment_dataset(df, text_column=\"text\", label_column=\"label\", n_augmentations=5):\n",
+    "    augmented_texts = []\n",
+    "    augmented_labels = []\n",
+    "    total_rows = len(df)\n",
+    "    \n",
+    "    \n",
+    "    if total_rows < 10:\n",
+    "        progress_step = 1\n",
+    "    else:\n",
+    "        progress_step = total_rows // 10\n",
+    "\n",
+    "    for index, row in df.iterrows():\n",
+    "        text, label = row[text_column], row[label_column]\n",
+    "        for _ in range(n_augmentations):\n",
+    "            augmented_text = send_prompt(f'Rephrase the following text:\\n{text}')\n",
+    "            augmented_texts.append(augmented_text)\n",
+    "            augmented_labels.append(label)\n",
+    "        \n",
+    "        # Verbose section\n",
+    "        if (index + 1) % progress_step == 0:\n",
+    "            percentage_done = (index + 1) / total_rows * 100\n",
+    "            print(f\"Done {percentage_done:.0f}%\")\n",
+    "\n",
+    "    augmented_df = pd.DataFrame({text_column: augmented_texts, label_column: augmented_labels})\n",
+    "    return pd.concat([df, augmented_df], ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ***Augment dataset***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done 10%\n",
+      "Done 20%\n",
+      "Done 30%\n",
+      "Done 40%\n",
+      "Done 50%\n",
+      "Done 60%\n",
+      "Done 70%\n",
+      "Done 80%\n",
+      "Done 90%\n",
+      "Done 100%\n"
+     ]
+    }
+   ],
+   "source": [
+    "augmented_df = augment_dataset(df, n_augmentations=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "augmented_df.to_csv(CONSTANTS.AUGMENTED_DATASET_PATH, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ***Preprocess augmented dataset***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.utils import shuffle\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import contractions\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***1- Refactor dataset columns so we have text and label only***\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>I have been experiencing a skin rash on my arm...</td>\n",
+       "      <td>Psoriasis</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>My skin has been peeling, especially on my kne...</td>\n",
+       "      <td>Psoriasis</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>I have been experiencing joint pain in my fing...</td>\n",
+       "      <td>Psoriasis</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>There is a silver like dusting on my skin, esp...</td>\n",
+       "      <td>Psoriasis</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>My nails have small dents or pits in them, and...</td>\n",
+       "      <td>Psoriasis</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text      label\n",
+       "0  I have been experiencing a skin rash on my arm...  Psoriasis\n",
+       "1  My skin has been peeling, especially on my kne...  Psoriasis\n",
+       "2  I have been experiencing joint pain in my fing...  Psoriasis\n",
+       "3  There is a silver like dusting on my skin, esp...  Psoriasis\n",
+       "4  My nails have small dents or pits in them, and...  Psoriasis"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = augmented_df\n",
+    "\n",
+    "df = df.drop(columns=['Unnamed: 0'])\n",
+    "df = df[['text', 'label']]\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***2- Check for duplicates***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of duplicates found: 59\n",
+      "Shape of after removing duplicates: (7141, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "duplicates = df[df.duplicated()]\n",
+    "print(f\"Number of duplicates found: {len(duplicates)}\")\n",
+    "\n",
+    "# Remove duplicates\n",
+    "df = df.drop_duplicates()\n",
+    "print(f\"Shape of after removing duplicates: {df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***3- Check for nulls***\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Count of null values in each column:\n",
+      "text     0\n",
+      "label    0\n",
+      "dtype: int64\n",
+      "Shape of the DataFrame after removing null values: (7141, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "null_values = df.isnull().sum()\n",
+    "print(\"Count of null values in each column:\")\n",
+    "print(null_values)\n",
+    "\n",
+    "# Remove rows with null values\n",
+    "df = df.dropna()\n",
+    "print(f\"Shape of the DataFrame after removing null values: {df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***4- Shuffle dataset***\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>I'm experiencing persistent symptoms of a cold...</td>\n",
+       "      <td>Common Cold</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>I'm experiencing intense symptoms including se...</td>\n",
+       "      <td>Jaundice</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>I'm experiencing a strong, unpleasant taste in...</td>\n",
+       "      <td>peptic ulcer disease</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>I have been experiencing digestive issues, inc...</td>\n",
+       "      <td>Migraine</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>I've noticed that the veins on my calves have ...</td>\n",
+       "      <td>Varicose Veins</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text                 label\n",
+       "0  I'm experiencing persistent symptoms of a cold...           Common Cold\n",
+       "1  I'm experiencing intense symptoms including se...              Jaundice\n",
+       "2  I'm experiencing a strong, unpleasant taste in...  peptic ulcer disease\n",
+       "3  I have been experiencing digestive issues, inc...              Migraine\n",
+       "4  I've noticed that the veins on my calves have ...        Varicose Veins"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = shuffle(df, random_state=42).reset_index(drop=True)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***5- Count unique texts for each label***\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Count of unique text values for each label\n",
+      "label\n",
+      "Jaundice                           288\n",
+      "Dimorphic Hemorrhoids              291\n",
+      "Malaria                            292\n",
+      "Migraine                           296\n",
+      "Acne                               296\n",
+      "Arthritis                          296\n",
+      "Pneumonia                          297\n",
+      "gastroesophageal reflux disease    297\n",
+      "Dengue                             298\n",
+      "Chicken pox                        298\n",
+      "drug reaction                      299\n",
+      "diabetes                           299\n",
+      "Typhoid                            299\n",
+      "Common Cold                        299\n",
+      "Impetigo                           299\n",
+      "Hypertension                       299\n",
+      "Cervical spondylosis               299\n",
+      "Bronchial Asthma                   299\n",
+      "peptic ulcer disease               300\n",
+      "Psoriasis                          300\n",
+      "Varicose Veins                     300\n",
+      "allergy                            300\n",
+      "Fungal infection                   300\n",
+      "urinary tract infection            300\n",
+      "Name: text, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "unique_text_counts = df.groupby('label')['text'].nunique()\n",
+    "unique_text_counts = unique_text_counts.sort_values()\n",
+    "\n",
+    "print(\"Count of unique text values for each label\")\n",
+    "print(unique_text_counts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***6- Convert into lowercase***\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>i'm experiencing persistent symptoms of a cold...</td>\n",
+       "      <td>common cold</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>i'm experiencing intense symptoms including se...</td>\n",
+       "      <td>jaundice</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>i'm experiencing a strong, unpleasant taste in...</td>\n",
+       "      <td>peptic ulcer disease</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>i have been experiencing digestive issues, inc...</td>\n",
+       "      <td>migraine</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>i've noticed that the veins on my calves have ...</td>\n",
+       "      <td>varicose veins</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text                 label\n",
+       "0  i'm experiencing persistent symptoms of a cold...           common cold\n",
+       "1  i'm experiencing intense symptoms including se...              jaundice\n",
+       "2  i'm experiencing a strong, unpleasant taste in...  peptic ulcer disease\n",
+       "3  i have been experiencing digestive issues, inc...              migraine\n",
+       "4  i've noticed that the veins on my calves have ...        varicose veins"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['text'] = df['text'].str.lower()\n",
+    "df['label'] = df['label'].str.lower()\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***7- Expand contractions***\n",
+    "**e.g. I'm => I am**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>i am experiencing persistent symptoms of a col...</td>\n",
+       "      <td>common cold</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>i am experiencing intense symptoms including s...</td>\n",
+       "      <td>jaundice</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>i am experiencing a strong, unpleasant taste i...</td>\n",
+       "      <td>peptic ulcer disease</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>i have been experiencing digestive issues, inc...</td>\n",
+       "      <td>migraine</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>i have noticed that the veins on my calves hav...</td>\n",
+       "      <td>varicose veins</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text                 label\n",
+       "0  i am experiencing persistent symptoms of a col...           common cold\n",
+       "1  i am experiencing intense symptoms including s...              jaundice\n",
+       "2  i am experiencing a strong, unpleasant taste i...  peptic ulcer disease\n",
+       "3  i have been experiencing digestive issues, inc...              migraine\n",
+       "4  i have noticed that the veins on my calves hav...        varicose veins"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['text'] = df['text'].apply(contractions.fix)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***8- Split into Train-Test and save in .csv files***\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train set shape: (4284, 2)\n",
+      "Test set shaape: (2857, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)\n",
+    "print(f\"Train set shape: {train_df.shape}\")\n",
+    "print(f\"Test set shaape: {test_df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_df.to_csv(CONSTANTS.AUGMENTED_TRAIN_SET_PATH, index=False)\n",
+    "test_df.to_csv(CONSTANTS.AUGMENTED_TEST_SET_PATH, index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "NLP",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/constants.py
+++ b/constants.py
@@ -2,3 +2,6 @@ class CONSTANTS:
    DATASET_PATH = 'data/Symptom2Disease.csv'
    TRAIN_SET_PATH = 'data/Preprocessed_Symptom2Disease_Train.csv'
    TEST_SET_PATH = 'data/Preprocessed_Symptom2Disease_Test.csv'
+    AUGMENTED_DATASET_PATH = 'data/augmented_Symptom2Disease.csv'
+    AUGMENTED_TRAIN_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Train.csv'
+    AUGMENTED_TEST_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Test.csv'
\ No newline at end of file
--- a/data/augmented_Preprocessed_Symptom2Disease_Test.csv
+++ b/data/augmented_Preprocessed_Symptom2Disease_Test.csv
--- a/data/augmented_Preprocessed_Symptom2Disease_Train.csv
+++ b/data/augmented_Preprocessed_Symptom2Disease_Train.csv
--- a/data/augmented_Symptom2Disease.csv
+++ b/data/augmented_Symptom2Disease.csv