Commit 36920ef7 authored by Almouhannad Hafez's avatar Almouhannad Hafez

(4) Apply augmentation on train set only

parent b355fbd8
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -44,7 +44,7 @@ ...@@ -44,7 +44,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -68,7 +68,7 @@ ...@@ -68,7 +68,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -102,7 +102,7 @@ ...@@ -102,7 +102,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -140,7 +140,7 @@ ...@@ -140,7 +140,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -180,14 +180,14 @@ ...@@ -180,14 +180,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Done 10%\n", " Done 10%\n",
"Done 20%\n", "Done 20%\n",
"Done 30%\n", "Done 30%\n",
"Done 40%\n", "Done 40%\n",
...@@ -206,7 +206,7 @@ ...@@ -206,7 +206,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -223,112 +223,59 @@ ...@@ -223,112 +223,59 @@
"# ***Preprocess augmented dataset***" "# ***Preprocess augmented dataset***"
] ]
}, },
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils import shuffle\n",
"from sklearn.model_selection import train_test_split\n",
"import contractions\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1- Refactor dataset columns so we have text and label only***\n"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "name": "stdout",
"text/html": [ "output_type": "stream",
"<div>\n", "text": [
"<style scoped>\n", " Id label text\n",
" .dataframe tbody tr th:only-of-type {\n", "0 0 Psoriasis I have been experiencing a skin rash on my arm...\n",
" vertical-align: middle;\n", "1 1 Psoriasis My skin has been peeling, especially on my kne...\n",
" }\n", "2 2 Psoriasis I have been experiencing joint pain in my fing...\n",
"\n", "3 3 Psoriasis There is a silver like dusting on my skin, esp...\n",
" .dataframe tbody tr th {\n", "4 4 Psoriasis My nails have small dents or pits in them, and...\n",
" vertical-align: top;\n", "... ... ... ...\n",
" }\n", "1195 1195 diabetes I'm shaking and trembling all over. I've lost ...\n",
"\n", "1196 1196 diabetes Particularly in the crevices of my skin, I hav...\n",
" .dataframe thead th {\n", "1197 1197 diabetes I regularly experience these intense urges and...\n",
" text-align: right;\n", "1198 1198 diabetes I have trouble breathing, especially outside. ...\n",
" }\n", "1199 1199 diabetes I constantly sneeze and have a dry cough. My i...\n",
"</style>\n", "\n",
"<table border=\"1\" class=\"dataframe\">\n", "[1200 rows x 3 columns]\n",
" <thead>\n", " Unnamed: 0 label text\n",
" <tr style=\"text-align: right;\">\n", "0 0 Psoriasis I have been experiencing a skin rash on my arm...\n",
" <th></th>\n", "1 1 Psoriasis My skin has been peeling, especially on my kne...\n",
" <th>text</th>\n", "2 2 Psoriasis I have been experiencing joint pain in my fing...\n",
" <th>label</th>\n", "3 3 Psoriasis There is a silver like dusting on my skin, esp...\n",
" </tr>\n", "4 4 Psoriasis My nails have small dents or pits in them, and...\n",
" </thead>\n", "... ... ... ...\n",
" <tbody>\n", "1195 295 diabetes I'm shaking and trembling all over. I've lost ...\n",
" <tr>\n", "1196 296 diabetes Particularly in the crevices of my skin, I hav...\n",
" <th>0</th>\n", "1197 297 diabetes I regularly experience these intense urges and...\n",
" <td>I have been experiencing a skin rash on my arm...</td>\n", "1198 298 diabetes I have trouble breathing, especially outside. ...\n",
" <td>Psoriasis</td>\n", "1199 299 diabetes I constantly sneeze and have a dry cough. My i...\n",
" </tr>\n", "\n",
" <tr>\n", "[1200 rows x 3 columns]\n"
" <th>1</th>\n", ]
" <td>My skin has been peeling, especially on my kne...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I have been experiencing joint pain in my fing...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>There is a silver like dusting on my skin, esp...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>My nails have small dents or pits in them, and...</td>\n",
" <td>Psoriasis</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 I have been experiencing a skin rash on my arm... Psoriasis\n",
"1 My skin has been peeling, especially on my kne... Psoriasis\n",
"2 I have been experiencing joint pain in my fing... Psoriasis\n",
"3 There is a silver like dusting on my skin, esp... Psoriasis\n",
"4 My nails have small dents or pits in them, and... Psoriasis"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
"df = augmented_df\n", "from sklearn.utils import shuffle\n",
"\n", "from sklearn.model_selection import train_test_split\n",
"df = df.drop(columns=['Id'])\n", "original_data = augmented_df.head(1200)\n",
"df = df[['text', 'label']]\n", "print(original_data)\n",
"\n", "print(df)"
"df.head()"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## ***2- Check for duplicates***" "## ***1. Split ORIGINAL DATASET into train and test sets***\n"
] ]
}, },
{ {
...@@ -340,59 +287,37 @@ ...@@ -340,59 +287,37 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Number of duplicates found: 59\n", "Train set shape: (720, 3)\n",
"Shape of after removing duplicates: (7141, 2)\n" "Test set shaape: (480, 3)\n"
] ]
} }
], ],
"source": [ "source": [
"duplicates = df[df.duplicated()]\n", "train_df, test_df = train_test_split(original_data, test_size=0.4, random_state=42)\n",
"print(f\"Number of duplicates found: {len(duplicates)}\")\n", "print(f\"Train set shape: {train_df.shape}\")\n",
"\n", "print(f\"Test set shaape: {test_df.shape}\")"
"# Remove duplicates\n",
"df = df.drop_duplicates()\n",
"print(f\"Shape of after removing duplicates: {df.shape}\")"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## ***3- Check for nulls***\n" "## ***2. Add augmented texts to TRAIN SET only***"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Count of null values in each column:\n",
"text 0\n",
"label 0\n",
"dtype: int64\n",
"Shape of the DataFrame after removing null values: (7141, 2)\n"
]
}
],
"source": [
"null_values = df.isnull().sum()\n",
"print(\"Count of null values in each column:\")\n",
"print(null_values)\n",
"\n",
"# Remove rows with null values\n",
"df = df.dropna()\n",
"print(f\"Shape of the DataFrame after removing null values: {df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"## ***4- Shuffle dataset***\n" "def get_augmented_texts_ids(original_id):\n",
" start = 1200\n",
" shift = 5 * original_id\n",
" augmented_texts_ids = []\n",
" for i in range(5):\n",
" augmented_texts_ids.append(start + shift + i)\n",
" return augmented_texts_ids"
] ]
}, },
{ {
...@@ -401,85 +326,33 @@ ...@@ -401,85 +326,33 @@
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "name": "stdout",
"text/html": [ "output_type": "stream",
"<div>\n", "text": [
"<style scoped>\n", "New train_set shape: (4320, 3)\n"
" .dataframe tbody tr th:only-of-type {\n", ]
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>I'm experiencing persistent symptoms of a cold...</td>\n",
" <td>Common Cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>I'm experiencing intense symptoms including se...</td>\n",
" <td>Jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I'm experiencing a strong, unpleasant taste in...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I have been experiencing digestive issues, inc...</td>\n",
" <td>Migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>I've noticed that the veins on my calves have ...</td>\n",
" <td>Varicose Veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 I'm experiencing persistent symptoms of a cold... Common Cold\n",
"1 I'm experiencing intense symptoms including se... Jaundice\n",
"2 I'm experiencing a strong, unpleasant taste in... peptic ulcer disease\n",
"3 I have been experiencing digestive issues, inc... Migraine\n",
"4 I've noticed that the veins on my calves have ... Varicose Veins"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
"df = shuffle(df, random_state=42).reset_index(drop=True)\n", "original_ids = train_df[\"Id\"]\n",
"dataframes_to_concat = []\n",
"\n",
"for id in original_ids:\n",
" for augmented_id in get_augmented_texts_ids(id):\n",
" augmented_rows = augmented_df[augmented_df[\"Id\"] == augmented_id]\n",
" dataframes_to_concat.append(augmented_rows)\n",
"\n",
"augmented_data = pd.concat(dataframes_to_concat, ignore_index=True)\n",
"train_df = pd.concat([train_df, augmented_data], ignore_index=True)\n",
"\n", "\n",
"df.head()" "print(f\"New train_set shape: {train_df.shape}\")"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## ***5- Count unique texts for each label***\n" "## ***3- Count unique texts for each label***"
] ]
}, },
{ {
...@@ -491,248 +364,103 @@ ...@@ -491,248 +364,103 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Count of unique text values for each label\n", "Count of unique text values for each label in train set\n",
"label\n", "label\n",
"Jaundice 288\n", "Fungal infection 144\n",
"Dimorphic Hemorrhoids 291\n", "Varicose Veins 150\n",
"Malaria 292\n", "Migraine 155\n",
"Migraine 296\n", "urinary tract infection 162\n",
"Acne 296\n", "Impetigo 162\n",
"Arthritis 296\n", "Jaundice 164\n",
"Pneumonia 297\n", "Dengue 166\n",
"gastroesophageal reflux disease 297\n", "Pneumonia 168\n",
"Dengue 298\n", "Acne 172\n",
"Chicken pox 298\n", "Cervical spondylosis 173\n",
"drug reaction 299\n", "Common Cold 174\n",
"diabetes 299\n", "gastroesophageal reflux disease 174\n",
"Typhoid 299\n", "Arthritis 179\n",
"Common Cold 299\n", "Dimorphic Hemorrhoids 184\n",
"Impetigo 299\n", "Bronchial Asthma 185\n",
"Hypertension 299\n", "Hypertension 191\n",
"Cervical spondylosis 299\n", "peptic ulcer disease 192\n",
"Bronchial Asthma 299\n", "Typhoid 192\n",
"peptic ulcer disease 300\n", "Malaria 194\n",
"Psoriasis 300\n", "diabetes 197\n",
"Varicose Veins 300\n", "Psoriasis 198\n",
"allergy 300\n", "drug reaction 204\n",
"Fungal infection 300\n", "Chicken pox 210\n",
"urinary tract infection 300\n", "allergy 210\n",
"Name: text, dtype: int64\n" "Name: text, dtype: int64\n"
] ]
} }
], ],
"source": [ "source": [
"unique_text_counts = df.groupby('label')['text'].nunique()\n", "# Train set\n",
"unique_text_counts = train_df.groupby('label')['text'].nunique()\n",
"unique_text_counts = unique_text_counts.sort_values()\n", "unique_text_counts = unique_text_counts.sort_values()\n",
"\n", "\n",
"print(\"Count of unique text values for each label\")\n", "print(\"Count of unique text values for each label in train set\")\n",
"print(unique_text_counts)" "print(unique_text_counts)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***6- Convert into lowercase***\n"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "name": "stdout",
"text/html": [ "output_type": "stream",
"<div>\n", "text": [
"<style scoped>\n", "Count of unique text values for each label in test set\n",
" .dataframe tbody tr th:only-of-type {\n", "label\n",
" vertical-align: middle;\n", "Chicken pox 15\n",
" }\n", "allergy 15\n",
"\n", "drug reaction 16\n",
" .dataframe tbody tr th {\n", "diabetes 17\n",
" vertical-align: top;\n", "Dimorphic Hemorrhoids 17\n",
" }\n", "Malaria 17\n",
"\n", "Psoriasis 17\n",
" .dataframe thead th {\n", "Hypertension 18\n",
" text-align: right;\n", "Typhoid 18\n",
" }\n", "peptic ulcer disease 18\n",
"</style>\n", "Arthritis 19\n",
"<table border=\"1\" class=\"dataframe\">\n", "Bronchial Asthma 19\n",
" <thead>\n", "Jaundice 20\n",
" <tr style=\"text-align: right;\">\n", "gastroesophageal reflux disease 21\n",
" <th></th>\n", "Acne 21\n",
" <th>text</th>\n", "Common Cold 21\n",
" <th>label</th>\n", "Cervical spondylosis 21\n",
" </tr>\n", "Pneumonia 21\n",
" </thead>\n", "Dengue 22\n",
" <tbody>\n", "Migraine 23\n",
" <tr>\n", "Impetigo 23\n",
" <th>0</th>\n", "urinary tract infection 23\n",
" <td>i'm experiencing persistent symptoms of a cold...</td>\n", "Varicose Veins 25\n",
" <td>common cold</td>\n", "Fungal infection 26\n",
" </tr>\n", "Name: text, dtype: int64\n"
" <tr>\n", ]
" <th>1</th>\n",
" <td>i'm experiencing intense symptoms including se...</td>\n",
" <td>jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>i'm experiencing a strong, unpleasant taste in...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>i have been experiencing digestive issues, inc...</td>\n",
" <td>migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>i've noticed that the veins on my calves have ...</td>\n",
" <td>varicose veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 i'm experiencing persistent symptoms of a cold... common cold\n",
"1 i'm experiencing intense symptoms including se... jaundice\n",
"2 i'm experiencing a strong, unpleasant taste in... peptic ulcer disease\n",
"3 i have been experiencing digestive issues, inc... migraine\n",
"4 i've noticed that the veins on my calves have ... varicose veins"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['text'] = df['text'].str.lower()\n",
"df['label'] = df['label'].str.lower()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***7- Expand contractions***\n",
"**i.e. I'm => I am**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>i am experiencing persistent symptoms of a col...</td>\n",
" <td>common cold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>i am experiencing intense symptoms including s...</td>\n",
" <td>jaundice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>i am experiencing a strong, unpleasant taste i...</td>\n",
" <td>peptic ulcer disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>i have been experiencing digestive issues, inc...</td>\n",
" <td>migraine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>i have noticed that the veins on my calves hav...</td>\n",
" <td>varicose veins</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 i am experiencing persistent symptoms of a col... common cold\n",
"1 i am experiencing intense symptoms including s... jaundice\n",
"2 i am experiencing a strong, unpleasant taste i... peptic ulcer disease\n",
"3 i have been experiencing digestive issues, inc... migraine\n",
"4 i have noticed that the veins on my calves hav... varicose veins"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
"df['text'] = df['text'].apply(contractions.fix)\n", "# Test set\n",
"df.head()" "unique_text_counts = test_df.groupby('label')['text'].nunique()\n",
"unique_text_counts = unique_text_counts.sort_values()\n",
"\n",
"print(\"Count of unique text values for each label in test set\")\n",
"print(unique_text_counts)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## ***8- Split into Train-Test and save in .csv files***\n" "## ***4- Save in .csv files***"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train set shape: (4284, 2)\n",
"Test set shaape: (2857, 2)\n"
]
}
],
"source": [
"train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)\n",
"print(f\"Train set shape: {train_df.shape}\")\n",
"print(f\"Test set shaape: {test_df.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment