Commit 36920ef7 authored by Almouhannad Hafez

(4) Apply augmentation on train set only

parent b355fbd8
......@@ -22,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
......@@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
......@@ -68,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
......@@ -102,7 +102,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
......@@ -140,7 +140,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
......@@ -180,14 +180,14 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done 10%\n",
" Done 10%\n",
"Done 20%\n",
"Done 30%\n",
"Done 40%\n",
......@@ -206,7 +206,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
......@@ -223,112 +223,59 @@
"# ***Preprocess augmented dataset***"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils import shuffle\n",
"from sklearn.model_selection import train_test_split\n",
"import contractions\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1- Refactor dataset columns so we have text and label only***\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>I have been experiencing a skin rash on my arm...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>My skin has been peeling, especially on my kne...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I have been experiencing joint pain in my fing...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>There is a silver like dusting on my skin, esp...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>My nails have small dents or pits in them, and...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 I have been experiencing a skin rash on my arm... Psoriasis\n",
"1 My skin has been peeling, especially on my kne... Psoriasis\n",
"2 I have been experiencing joint pain in my fing... Psoriasis\n",
"3 There is a silver like dusting on my skin, esp... Psoriasis\n",
"4 My nails have small dents or pits in them, and... Psoriasis"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
" Id label text\n",
"0 0 Psoriasis I have been experiencing a skin rash on my arm...\n",
"1 1 Psoriasis My skin has been peeling, especially on my kne...\n",
"2 2 Psoriasis I have been experiencing joint pain in my fing...\n",
"3 3 Psoriasis There is a silver like dusting on my skin, esp...\n",
"4 4 Psoriasis My nails have small dents or pits in them, and...\n",
"... ... ... ...\n",
"1195 1195 diabetes I'm shaking and trembling all over. I've lost ...\n",
"1196 1196 diabetes Particularly in the crevices of my skin, I hav...\n",
"1197 1197 diabetes I regularly experience these intense urges and...\n",
"1198 1198 diabetes I have trouble breathing, especially outside. ...\n",
"1199 1199 diabetes I constantly sneeze and have a dry cough. My i...\n",
"\n",
"[1200 rows x 3 columns]\n",
" Unnamed: 0 label text\n",
"0 0 Psoriasis I have been experiencing a skin rash on my arm...\n",
"1 1 Psoriasis My skin has been peeling, especially on my kne...\n",
"2 2 Psoriasis I have been experiencing joint pain in my fing...\n",
"3 3 Psoriasis There is a silver like dusting on my skin, esp...\n",
"4 4 Psoriasis My nails have small dents or pits in them, and...\n",
"... ... ... ...\n",
"1195 295 diabetes I'm shaking and trembling all over. I've lost ...\n",
"1196 296 diabetes Particularly in the crevices of my skin, I hav...\n",
"1197 297 diabetes I regularly experience these intense urges and...\n",
"1198 298 diabetes I have trouble breathing, especially outside. ...\n",
"1199 299 diabetes I constantly sneeze and have a dry cough. My i...\n",
"\n",
"[1200 rows x 3 columns]\n"
]
}
],
"source": [
"df = augmented_df\n",
"\n",
"df = df.drop(columns=['Id'])\n",
"df = df[['text', 'label']]\n",
"\n",
"df.head()"
"from sklearn.utils import shuffle\n",
"from sklearn.model_selection import train_test_split\n",
"original_data = augmented_df.head(1200)\n",
"print(original_data)\n",
"print(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2- Check for duplicates***"
"## ***1. Split ORIGINAL DATASET into train and test sets***\n"
]
},
{
......@@ -340,59 +287,37 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Number of duplicates found: 59\n",
"Shape of after removing duplicates: (7141, 2)\n"
"Train set shape: (720, 3)\n",
"Test set shaape: (480, 3)\n"
]
}
],
"source": [
"duplicates = df[df.duplicated()]\n",
"print(f\"Number of duplicates found: {len(duplicates)}\")\n",
"\n",
"# Remove duplicates\n",
"df = df.drop_duplicates()\n",
"print(f\"Shape of after removing duplicates: {df.shape}\")"
"train_df, test_df = train_test_split(original_data, test_size=0.4, random_state=42)\n",
"print(f\"Train set shape: {train_df.shape}\")\n",
"print(f\"Test set shaape: {test_df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3- Check for nulls***\n"
"## ***2. Add augmented texts to TRAIN SET only***"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Count of null values in each column:\n",
"text 0\n",
"label 0\n",
"dtype: int64\n",
"Shape of the DataFrame after removing null values: (7141, 2)\n"
]
}
],
"source": [
"null_values = df.isnull().sum()\n",
"print(\"Count of null values in each column:\")\n",
"print(null_values)\n",
"\n",
"# Remove rows with null values\n",
"df = df.dropna()\n",
"print(f\"Shape of the DataFrame after removing null values: {df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"## ***4- Shuffle dataset***\n"
"def get_augmented_texts_ids(original_id):\n",
" start = 1200\n",
" shift = 5 * original_id\n",
" augmented_texts_ids = []\n",
" for i in range(5):\n",
" augmented_texts_ids.append(start + shift + i)\n",
" return augmented_texts_ids"
]
},
{
......@@ -401,85 +326,33 @@
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>I'm experiencing persistent symptoms of a cold...</td>\n",
" <td>Common Cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>I'm experiencing intense symptoms including se...</td>\n",
" <td>Jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I'm experiencing a strong, unpleasant taste in...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I have been experiencing digestive issues, inc...</td>\n",
" <td>Migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>I've noticed that the veins on my calves have ...</td>\n",
" <td>Varicose Veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 I'm experiencing persistent symptoms of a cold... Common Cold\n",
"1 I'm experiencing intense symptoms including se... Jaundice\n",
"2 I'm experiencing a strong, unpleasant taste in... peptic ulcer disease\n",
"3 I have been experiencing digestive issues, inc... Migraine\n",
"4 I've noticed that the veins on my calves have ... Varicose Veins"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"New train_set shape: (4320, 3)\n"
]
}
],
"source": [
"df = shuffle(df, random_state=42).reset_index(drop=True)\n",
"original_ids = train_df[\"Id\"]\n",
"dataframes_to_concat = []\n",
"\n",
"for id in original_ids:\n",
" for augmented_id in get_augmented_texts_ids(id):\n",
" augmented_rows = augmented_df[augmented_df[\"Id\"] == augmented_id]\n",
" dataframes_to_concat.append(augmented_rows)\n",
"\n",
"augmented_data = pd.concat(dataframes_to_concat, ignore_index=True)\n",
"train_df = pd.concat([train_df, augmented_data], ignore_index=True)\n",
"\n",
"df.head()"
"print(f\"New train_set shape: {train_df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***5- Count unique texts for each label***\n"
"## ***3- Count unique texts for each label***"
]
},
{
......@@ -491,248 +364,103 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Count of unique text values for each label\n",
"Count of unique text values for each label in train set\n",
"label\n",
"Jaundice 288\n",
"Dimorphic Hemorrhoids 291\n",
"Malaria 292\n",
"Migraine 296\n",
"Acne 296\n",
"Arthritis 296\n",
"Pneumonia 297\n",
"gastroesophageal reflux disease 297\n",
"Dengue 298\n",
"Chicken pox 298\n",
"drug reaction 299\n",
"diabetes 299\n",
"Typhoid 299\n",
"Common Cold 299\n",
"Impetigo 299\n",
"Hypertension 299\n",
"Cervical spondylosis 299\n",
"Bronchial Asthma 299\n",
"peptic ulcer disease 300\n",
"Psoriasis 300\n",
"Varicose Veins 300\n",
"allergy 300\n",
"Fungal infection 300\n",
"urinary tract infection 300\n",
"Fungal infection 144\n",
"Varicose Veins 150\n",
"Migraine 155\n",
"urinary tract infection 162\n",
"Impetigo 162\n",
"Jaundice 164\n",
"Dengue 166\n",
"Pneumonia 168\n",
"Acne 172\n",
"Cervical spondylosis 173\n",
"Common Cold 174\n",
"gastroesophageal reflux disease 174\n",
"Arthritis 179\n",
"Dimorphic Hemorrhoids 184\n",
"Bronchial Asthma 185\n",
"Hypertension 191\n",
"peptic ulcer disease 192\n",
"Typhoid 192\n",
"Malaria 194\n",
"diabetes 197\n",
"Psoriasis 198\n",
"drug reaction 204\n",
"Chicken pox 210\n",
"allergy 210\n",
"Name: text, dtype: int64\n"
]
}
],
"source": [
"unique_text_counts = df.groupby('label')['text'].nunique()\n",
"# Train set\n",
"unique_text_counts = train_df.groupby('label')['text'].nunique()\n",
"unique_text_counts = unique_text_counts.sort_values()\n",
"\n",
"print(\"Count of unique text values for each label\")\n",
"print(\"Count of unique text values for each label in train set\")\n",
"print(unique_text_counts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***6- Convert into lowercase***\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>i'm experiencing persistent symptoms of a cold...</td>\n",
" <td>common cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>i'm experiencing intense symptoms including se...</td>\n",
" <td>jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>i'm experiencing a strong, unpleasant taste in...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>i have been experiencing digestive issues, inc...</td>\n",
" <td>migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>i've noticed that the veins on my calves have ...</td>\n",
" <td>varicose veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 i'm experiencing persistent symptoms of a cold... common cold\n",
"1 i'm experiencing intense symptoms including se... jaundice\n",
"2 i'm experiencing a strong, unpleasant taste in... peptic ulcer disease\n",
"3 i have been experiencing digestive issues, inc... migraine\n",
"4 i've noticed that the veins on my calves have ... varicose veins"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['text'] = df['text'].str.lower()\n",
"df['label'] = df['label'].str.lower()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***7- Expand contractions***\n",
"**i.e. I'm => I am**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>i am experiencing persistent symptoms of a col...</td>\n",
" <td>common cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>i am experiencing intense symptoms including s...</td>\n",
" <td>jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>i am experiencing a strong, unpleasant taste i...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>i have been experiencing digestive issues, inc...</td>\n",
" <td>migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>i have noticed that the veins on my calves hav...</td>\n",
" <td>varicose veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 i am experiencing persistent symptoms of a col... common cold\n",
"1 i am experiencing intense symptoms including s... jaundice\n",
"2 i am experiencing a strong, unpleasant taste i... peptic ulcer disease\n",
"3 i have been experiencing digestive issues, inc... migraine\n",
"4 i have noticed that the veins on my calves hav... varicose veins"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"Count of unique text values for each label in test set\n",
"label\n",
"Chicken pox 15\n",
"allergy 15\n",
"drug reaction 16\n",
"diabetes 17\n",
"Dimorphic Hemorrhoids 17\n",
"Malaria 17\n",
"Psoriasis 17\n",
"Hypertension 18\n",
"Typhoid 18\n",
"peptic ulcer disease 18\n",
"Arthritis 19\n",
"Bronchial Asthma 19\n",
"Jaundice 20\n",
"gastroesophageal reflux disease 21\n",
"Acne 21\n",
"Common Cold 21\n",
"Cervical spondylosis 21\n",
"Pneumonia 21\n",
"Dengue 22\n",
"Migraine 23\n",
"Impetigo 23\n",
"urinary tract infection 23\n",
"Varicose Veins 25\n",
"Fungal infection 26\n",
"Name: text, dtype: int64\n"
]
}
],
"source": [
"df['text'] = df['text'].apply(contractions.fix)\n",
"df.head()"
"# Test set\n",
"unique_text_counts = test_df.groupby('label')['text'].nunique()\n",
"unique_text_counts = unique_text_counts.sort_values()\n",
"\n",
"print(\"Count of unique text values for each label in test set\")\n",
"print(unique_text_counts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***8- Split into Train-Test and save in .csv files***\n"
"## ***4- Save in .csv files***"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train set shape: (4284, 2)\n",
"Test set shaape: (2857, 2)\n"
]
}
],
"source": [
"train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)\n",
"print(f\"Train set shape: {train_df.shape}\")\n",
"print(f\"Test set shaape: {test_df.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment