Commit a3277a8d authored by Almouhannad Hafez's avatar Almouhannad Hafez

Dataset Augmentation

parent af95466f
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from ollama import Client\n",
"\n",
"import re\n",
"\n",
"from constants import CONSTANTS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***Read dataset***"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the raw dataset and stop early if it is not the expected 1200 rows x 3 columns.\n",
"df = pd.read_csv(CONSTANTS.DATASET_PATH)\n",
"assert df.shape == (1200, 3), f\"Expected shape (1200, 3), but got {df.shape}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***Some helper functions***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***Send prompts to `llama3` API***"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Ollama client shared by send_prompt; the server address is hard-coded here.\n",
"client = Client(host='http://172.25.1.141:11434')\n",
"\n",
"def clean_generated_text(text):\n",
"    \"\"\"Normalise a model reply to a single line of plain text.\n",
"\n",
"    Drops every character that is not a word character, whitespace, comma,\n",
"    period or apostrophe, then collapses each run of whitespace (newlines\n",
"    included) into one space and removes leading/trailing whitespace.\n",
"    \"\"\"\n",
"    kept = re.sub(r'[^\\w\\s,.\\']', '', text)\n",
"    # split() with no arguments breaks on any whitespace, newlines included,\n",
"    # and ignores leading/trailing blanks, so one join yields the final form.\n",
"    return ' '.join(kept.split())\n",
"\n",
"\n",
"def send_prompt(prompt):\n",
"    \"\"\"Send `prompt` to the `llama3` model and return the cleaned reply.\n",
"\n",
"    The instruction 'Send the response only.' is appended on a new line to the\n",
"    prompt, which is sent as a single user message; the reply text is passed\n",
"    through `clean_generated_text` before being returned.\n",
"    \"\"\"\n",
"    user_message = {\n",
"        'role': 'user',\n",
"        'content': prompt + '\\nSend the response only.',\n",
"    }\n",
"    reply = client.chat(model='llama3', messages=[user_message])\n",
"    generated = reply['message']['content']\n",
"    return clean_generated_text(generated)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### ***Example usage***\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I've noticed a persistent skin rash on my arms, legs, and torso over the past few weeks. The affected areas appear red, itchy, and are characterized by dry, scaling patches.\n",
"For the past few weeks, I've been dealing with a persistent skin rash that affects my arms, legs, and torso. The rash appears as red, itchy areas with dry, flaky patches.\n",
"For several weeks, I've been dealing with a persistent skin issue affecting my arms, legs, and torso. The rash appears as red, itchy areas covered in dry, flaky scales.\n",
"For several weeks, I've noticed a persistent skin issue affecting my arms, legs, and torso. The affected areas appear red, irritated, and feature dry, flaky scales.\n",
"For the past few weeks, I've been dealing with an uncomfortable skin issue that's affected my arms, legs, and torso. The rash has characterized by a reddish hue, intense itchiness, and the presence of dry, flaky scales.\n"
]
}
],
"source": [
"# Rephrase the same sentence five times to see how the paraphrases differ between calls.\n",
"text = \"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.\"\n",
"for i in range(5):\n",
" result_i = send_prompt(f'Rephrase the following text:\\n{text}')\n",
" print(result_i)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***Augment dataset using `llm`***\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**`n_augmentations` is the number of rephrased texts to generate for each row in the original dataset.**"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def augment_dataset(df, text_column=\"text\", label_column=\"label\", n_augmentations=5):\n",
"    \"\"\"Append LLM-generated paraphrases of every row's text to the dataset.\n",
"\n",
"    Each text in `text_column` is sent to `send_prompt` `n_augmentations` times;\n",
"    every paraphrase is stored with the label of the row it came from.\n",
"    Progress is printed about every 10% of the processed rows.\n",
"\n",
"    Args:\n",
"        df: DataFrame holding at least `text_column` and `label_column`.\n",
"        text_column: Name of the column with the texts to rephrase.\n",
"        label_column: Name of the column with the label copied to each paraphrase.\n",
"        n_augmentations: Number of paraphrases requested per row.\n",
"\n",
"    Returns:\n",
"        A new DataFrame: the rows of `df` followed by the generated rows, with a\n",
"        fresh integer index. Generated rows only carry `text_column` and\n",
"        `label_column`; any other column of `df` is NaN for them.\n",
"    \"\"\"\n",
"    total_rows = len(df)\n",
"    # Report about ten times per run, and after every row for frames under 10 rows.\n",
"    progress_step = max(1, total_rows // 10)\n",
"\n",
"    augmented_texts = []\n",
"    augmented_labels = []\n",
"\n",
"    # Count rows by position: index labels are not guaranteed to be 0..n-1\n",
"    # (filtered, shuffled or non-integer indexes), so they cannot drive the\n",
"    # progress arithmetic.\n",
"    pairs = zip(df[text_column], df[label_column])\n",
"    for position, (text, label) in enumerate(pairs, start=1):\n",
"        prompt = f'Rephrase the following text:\\n{text}'\n",
"        for _ in range(n_augmentations):\n",
"            augmented_texts.append(send_prompt(prompt))\n",
"            augmented_labels.append(label)\n",
"\n",
"        # Verbose section\n",
"        if position % progress_step == 0:\n",
"            percentage_done = position / total_rows * 100\n",
"            print(f\"Done {percentage_done:.0f}%\")\n",
"\n",
"    augmented_df = pd.DataFrame({text_column: augmented_texts, label_column: augmented_labels})\n",
"    return pd.concat([df, augmented_df], ignore_index=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Augment dataset***"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done 10%\n",
"Done 20%\n",
"Done 30%\n",
"Done 40%\n",
"Done 50%\n",
"Done 60%\n",
"Done 70%\n",
"Done 80%\n",
"Done 90%\n",
"Done 100%\n"
]
}
],
"source": [
"# Generate 5 paraphrases per original row; the result holds the original rows followed by the generated ones.\n",
"augmented_df = augment_dataset(df, n_augmentations=5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Save the original + generated rows to CSV, without writing the DataFrame index.\n",
"augmented_df.to_csv(CONSTANTS.AUGMENTED_DATASET_PATH, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Preprocess augmented dataset***"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils import shuffle\n",
"from sklearn.model_selection import train_test_split\n",
"import contractions\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1- Refactor dataset columns so we have text and label only***\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>I have been experiencing a skin rash on my arm...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>My skin has been peeling, especially on my kne...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I have been experiencing joint pain in my fing...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>There is a silver like dusting on my skin, esp...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>My nails have small dents or pits in them, and...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 I have been experiencing a skin rash on my arm... Psoriasis\n",
"1 My skin has been peeling, especially on my kne... Psoriasis\n",
"2 I have been experiencing joint pain in my fing... Psoriasis\n",
"3 There is a silver like dusting on my skin, esp... Psoriasis\n",
"4 My nails have small dents or pits in them, and... Psoriasis"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the two columns used downstream. Selecting them already discards the\n",
"# stray 'Unnamed: 0' column, and unlike an explicit drop() it does not raise a\n",
"# KeyError when that column is absent. copy() keeps augmented_df untouched by\n",
"# the later in-place column assignments on df.\n",
"df = augmented_df[['text', 'label']].copy()\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2- Check for duplicates***"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of duplicates found: 59\n",
"Shape of after removing duplicates: (7141, 2)\n"
]
}
],
"source": [
"# A row is a duplicate when both its text and its label repeat an earlier row; the first occurrence is kept.\n",
"# NOTE(review): this runs before lowercasing and contraction expansion, so rows that only become\n",
"# identical after those steps are not removed here.\n",
"duplicates = df[df.duplicated()]\n",
"print(f\"Number of duplicates found: {len(duplicates)}\")\n",
"\n",
"# Remove duplicates\n",
"df = df.drop_duplicates()\n",
"print(f\"Shape of after removing duplicates: {df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3- Check for nulls***\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Count of null values in each column:\n",
"text 0\n",
"label 0\n",
"dtype: int64\n",
"Shape of the DataFrame after removing null values: (7141, 2)\n"
]
}
],
"source": [
"# Count missing values per column, then drop every row that contains at least one.\n",
"null_values = df.isnull().sum()\n",
"print(\"Count of null values in each column:\")\n",
"print(null_values)\n",
"\n",
"# Remove rows with null values\n",
"df = df.dropna()\n",
"print(f\"Shape of the DataFrame after removing null values: {df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***4- Shuffle dataset***\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>I'm experiencing persistent symptoms of a cold...</td>\n",
" <td>Common Cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>I'm experiencing intense symptoms including se...</td>\n",
" <td>Jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I'm experiencing a strong, unpleasant taste in...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I have been experiencing digestive issues, inc...</td>\n",
" <td>Migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>I've noticed that the veins on my calves have ...</td>\n",
" <td>Varicose Veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 I'm experiencing persistent symptoms of a cold... Common Cold\n",
"1 I'm experiencing intense symptoms including se... Jaundice\n",
"2 I'm experiencing a strong, unpleasant taste in... peptic ulcer disease\n",
"3 I have been experiencing digestive issues, inc... Migraine\n",
"4 I've noticed that the veins on my calves have ... Varicose Veins"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Shuffle the rows with a fixed seed and renumber the index from 0.\n",
"df = shuffle(df, random_state=42).reset_index(drop=True)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***5- Count unique texts for each label***\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Count of unique text values for each label\n",
"label\n",
"Jaundice 288\n",
"Dimorphic Hemorrhoids 291\n",
"Malaria 292\n",
"Migraine 296\n",
"Acne 296\n",
"Arthritis 296\n",
"Pneumonia 297\n",
"gastroesophageal reflux disease 297\n",
"Dengue 298\n",
"Chicken pox 298\n",
"drug reaction 299\n",
"diabetes 299\n",
"Typhoid 299\n",
"Common Cold 299\n",
"Impetigo 299\n",
"Hypertension 299\n",
"Cervical spondylosis 299\n",
"Bronchial Asthma 299\n",
"peptic ulcer disease 300\n",
"Psoriasis 300\n",
"Varicose Veins 300\n",
"allergy 300\n",
"Fungal infection 300\n",
"urinary tract infection 300\n",
"Name: text, dtype: int64\n"
]
}
],
"source": [
"# Number of distinct texts per label, smallest count first\n",
"unique_text_counts = df.groupby('label')['text'].nunique().sort_values()\n",
"\n",
"print(\"Count of unique text values for each label\")\n",
"print(unique_text_counts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***6- Convert into lowercase***\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>i'm experiencing persistent symptoms of a cold...</td>\n",
" <td>common cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>i'm experiencing intense symptoms including se...</td>\n",
" <td>jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>i'm experiencing a strong, unpleasant taste in...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>i have been experiencing digestive issues, inc...</td>\n",
" <td>migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>i've noticed that the veins on my calves have ...</td>\n",
" <td>varicose veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 i'm experiencing persistent symptoms of a cold... common cold\n",
"1 i'm experiencing intense symptoms including se... jaundice\n",
"2 i'm experiencing a strong, unpleasant taste in... peptic ulcer disease\n",
"3 i have been experiencing digestive issues, inc... migraine\n",
"4 i've noticed that the veins on my calves have ... varicose veins"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lowercase both the texts and the labels.\n",
"df['text'] = df['text'].str.lower()\n",
"df['label'] = df['label'].str.lower()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***7- Expand contractions***\n",
"**e.g. I'm => I am**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>i am experiencing persistent symptoms of a col...</td>\n",
" <td>common cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>i am experiencing intense symptoms including s...</td>\n",
" <td>jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>i am experiencing a strong, unpleasant taste i...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>i have been experiencing digestive issues, inc...</td>\n",
" <td>migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>i have noticed that the veins on my calves hav...</td>\n",
" <td>varicose veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 i am experiencing persistent symptoms of a col... common cold\n",
"1 i am experiencing intense symptoms including s... jaundice\n",
"2 i am experiencing a strong, unpleasant taste i... peptic ulcer disease\n",
"3 i have been experiencing digestive issues, inc... migraine\n",
"4 i have noticed that the veins on my calves hav... varicose veins"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Expand the contractions in every text with contractions.fix.\n",
"df['text'] = df['text'].apply(contractions.fix)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***8- Split into Train-Test and save in .csv files***\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train set shape: (4284, 2)\n",
"Test set shape: (2857, 2)\n"
]
}
],
"source": [
"# 60/40 random split with a fixed seed.\n",
"# NOTE(review): the split is row-wise, so paraphrases generated from the same original\n",
"# sentence can end up on both sides; test scores may be optimistic because of that.\n",
"train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)\n",
"print(f\"Train set shape: {train_df.shape}\")\n",
"print(f\"Test set shape: {test_df.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Write the train and test splits to CSV, without the DataFrame index.\n",
"train_df.to_csv(CONSTANTS.AUGMENTED_TRAIN_SET_PATH, index=False)\n",
"test_df.to_csv(CONSTANTS.AUGMENTED_TEST_SET_PATH, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "NLP",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
class CONSTANTS:
    """Locations of the dataset files, relative to the project root."""

    # Raw dataset
    DATASET_PATH = 'data/Symptom2Disease.csv'

    # Preprocessed train/test splits
    TRAIN_SET_PATH = 'data/Preprocessed_Symptom2Disease_Train.csv'
    TEST_SET_PATH = 'data/Preprocessed_Symptom2Disease_Test.csv'

    # Augmented dataset and its preprocessed train/test splits
    AUGMENTED_DATASET_PATH = 'data/augmented_Symptom2Disease.csv'
    AUGMENTED_TRAIN_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Train.csv'
    AUGMENTED_TEST_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Test.csv'
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment