(0) Add dataset overview

8917cc79 · Almouhannad Hafez · 673dc40c · 8917cc79 · 8917cc79 · 8917cc79
Commit 8917cc79 authored Nov 09, 2024 by Almouhannad Hafez
5 changed files
--- a/0.Dataset_Overview.ipynb
+++ b/0.Dataset_Overview.ipynb
--- a/2.Stemmer.ipynb
+++ b/2.Stemmer.ipynb
@@ -13,7 +13,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "import nltk\n",
    "from nltk.stem import PorterStemmer\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
@@ -29,58 +28,6 @@
    "from constants import CONSTANTS"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Now, we have to download `Punkt Tokenizer Model`, try running following cell, if it didn't work successfully then try to download model manually from following links: [Manual installation](https://www.nltk.org/data.html), and [Model link](https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Uncomment if you haven't already**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# nltk.download('punkt')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# nltk.download('stopwords')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# nltk.download('punkt_tab')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### You must see an output similar to the following output:\n",
-    "> `[nltk_data] Downloading package punkt to`  \n",
-    "> `[nltk_data]     ...\\AppData\\Roaming\\nltk_data...`  \n",
-    "> `[nltk_data]   Package punkt is already up-to-date!`  \n",
-    "> `True`"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},

--- a/3.1.Lemmatizer.ipynb
+++ b/3.1.Lemmatizer.ipynb
@@ -42,33 +42,6 @@
    "**Uncomment if you haven't already**"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# nltk.download('wordnet')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# nltk.download('averaged_perceptron_tagger')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# nltk.download('averaged_perceptron_tagger_eng')"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 5,

--- a/3.4.N-Grams-Multi.ipynb
+++ b/3.4.N-Grams-Multi.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ***Setup***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "\n",
+    "from tabulate import tabulate\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from constants import CONSTANTS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk\n",
+    "from nltk.stem import PorterStemmer\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.corpus import stopwords"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook evaluates word N-gram TF-IDF features with a Multinomial Naive Bayes classifier, with and without stemming the text first."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Some helper functions***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Single Porter stemmer instance, created once and reused by every stem_text call.\n",
+    "stemmer = PorterStemmer()\n",
+    "\n",
+    "def stem_text(text):\n",
+    "    \"\"\"Tokenize `text`, stem every token, and re-join the stems with single spaces.\"\"\"\n",
+    "    return ' '.join(map(stemmer.stem, word_tokenize(text)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_clf_report_as_table(report):\n",
+    "    \"\"\"Print a `classification_report(..., output_dict=True)` dict as a psql-style table.\n",
+    "\n",
+    "    Per-class rows come first, in the dict's own order, followed by the summary\n",
+    "    rows: accuracy (when present), macro avg and weighted avg.\n",
+    "\n",
+    "    The 'accuracy' entry is treated as optional: scikit-learn replaces it with a\n",
+    "    'micro avg' entry for multilabel targets or when only a subset of labels is\n",
+    "    reported, and reading it unconditionally raised KeyError in that case.\n",
+    "    \"\"\"\n",
+    "    summary_keys = ('accuracy', 'macro avg', 'weighted avg')\n",
+    "\n",
+    "    rows = [\n",
+    "        [label, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]\n",
+    "        for label, scores in report.items()\n",
+    "        if label not in summary_keys\n",
+    "    ]\n",
+    "\n",
+    "    # The accuracy value is placed in the F1-score column; the other columns stay blank.\n",
+    "    if 'accuracy' in report:\n",
+    "        rows.append(['accuracy', '', '', report['accuracy'], ''])\n",
+    "\n",
+    "    # Averages are printed without their support value, as before.\n",
+    "    for avg_key in ('macro avg', 'weighted avg'):\n",
+    "        avg = report[avg_key]\n",
+    "        rows.append([avg_key, avg['precision'], avg['recall'], avg['f1-score'], ''])\n",
+    "\n",
+    "    print(tabulate(rows, headers=['Class', 'Precision', 'Recall', 'F1-score', 'Support'], tablefmt='psql'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_prepared_data():\n",
+    "    \"\"\"Read the train and test CSVs named in CONSTANTS and split out their columns.\n",
+    "\n",
+    "    Returns:\n",
+    "        (X_train, X_test, y_train, y_test): the 'text' column and the 'label'\n",
+    "        column of the train set and of the test set.\n",
+    "    \"\"\"\n",
+    "    train_df = pd.read_csv(CONSTANTS.TRAIN_SET_PATH)\n",
+    "    test_df = pd.read_csv(CONSTANTS.TEST_SET_PATH)\n",
+    "    return train_df['text'], test_df['text'], train_df['label'], test_df['label']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def evaluate_n_grams(n1, n2):\n",
+    "    \"\"\"Train and score a TF-IDF + Multinomial Naive Bayes model on raw text.\n",
+    "\n",
+    "    Args:\n",
+    "        n1: lower bound of the n-gram range passed to TfidfVectorizer.\n",
+    "        n2: upper bound of the n-gram range passed to TfidfVectorizer.\n",
+    "\n",
+    "    Prints the train accuracy, the test accuracy and the per-class test report.\n",
+    "    \"\"\"\n",
+    "    train_texts, test_texts, train_labels, test_labels = get_prepared_data()\n",
+    "\n",
+    "    # The vectorizer is fitted on the training texts only; test texts are just transformed.\n",
+    "    tfidf = TfidfVectorizer(ngram_range=(n1, n2))\n",
+    "    train_features = tfidf.fit_transform(train_texts)\n",
+    "    test_features = tfidf.transform(test_texts)\n",
+    "\n",
+    "    model = MultinomialNB()\n",
+    "    model.fit(train_features, train_labels)\n",
+    "\n",
+    "    # Score on the data the model was trained on, then on the held-out test set.\n",
+    "    train_accuracy = accuracy_score(train_labels, model.predict(train_features))\n",
+    "    test_predictions = model.predict(test_features)\n",
+    "    test_accuracy = accuracy_score(test_labels, test_predictions)\n",
+    "\n",
+    "    print(f'Train Accuracy: {train_accuracy}')\n",
+    "    print(f'Test Accuracy: {test_accuracy}')\n",
+    "    print_clf_report_as_table(\n",
+    "        classification_report(test_labels, test_predictions, output_dict=True)\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def evaluate_n_grams_stemming(n1, n2):\n",
+    "    \"\"\"Train and score a TF-IDF + Multinomial Naive Bayes model on stemmed text.\n",
+    "\n",
+    "    Args:\n",
+    "        n1: lower bound of the n-gram range passed to TfidfVectorizer.\n",
+    "        n2: upper bound of the n-gram range passed to TfidfVectorizer.\n",
+    "\n",
+    "    Prints the train accuracy, the test accuracy and the per-class test report.\n",
+    "    \"\"\"\n",
+    "    train_texts, test_texts, train_labels, test_labels = get_prepared_data()\n",
+    "\n",
+    "    # Stem every document before vectorizing, so n-grams are built from stems.\n",
+    "    train_texts = train_texts.apply(stem_text)\n",
+    "    test_texts = test_texts.apply(stem_text)\n",
+    "\n",
+    "    # The vectorizer is fitted on the training texts only; test texts are just transformed.\n",
+    "    tfidf = TfidfVectorizer(ngram_range=(n1, n2))\n",
+    "    train_features = tfidf.fit_transform(train_texts)\n",
+    "    test_features = tfidf.transform(test_texts)\n",
+    "\n",
+    "    model = MultinomialNB()\n",
+    "    model.fit(train_features, train_labels)\n",
+    "\n",
+    "    # Score on the data the model was trained on, then on the held-out test set.\n",
+    "    train_accuracy = accuracy_score(train_labels, model.predict(train_features))\n",
+    "    test_predictions = model.predict(test_features)\n",
+    "    test_accuracy = accuracy_score(test_labels, test_predictions)\n",
+    "\n",
+    "    print(f'Train Accuracy: {train_accuracy}')\n",
+    "    print(f'Test Accuracy: {test_accuracy}')\n",
+    "    print_clf_report_as_table(\n",
+    "        classification_report(test_labels, test_predictions, output_dict=True)\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***1- N-Grams, no stemming***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***1-2***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 0.9985528219971056\n",
+      "Test Accuracy: 0.8982683982683982\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.95               | 1.0                |   0.974359 | 19.0      |\n",
+      "| allergy                         | 0.8823529411764706 | 0.7894736842105263 |   0.833333 | 19.0      |\n",
+      "| arthritis                       | 0.7777777777777778 | 1.0                |   0.875    | 14.0      |\n",
+      "| bronchial asthma                | 0.7391304347826086 | 1.0                |   0.85     | 17.0      |\n",
+      "| cervical spondylosis            | 1.0                | 1.0                |   1        | 21.0      |\n",
+      "| chicken pox                     | 0.8333333333333334 | 0.7894736842105263 |   0.810811 | 19.0      |\n",
+      "| common cold                     | 0.7272727272727273 | 0.8888888888888888 |   0.8      | 18.0      |\n",
+      "| dengue                          | 0.5185185185185185 | 0.875              |   0.651163 | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.631578947368421  |   0.774194 | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| drug reaction                   | 0.8666666666666667 | 0.8125             |   0.83871  | 16.0      |\n",
+      "| fungal infection                | 0.8181818181818182 | 1.0                |   0.9      | 18.0      |\n",
+      "| gastroesophageal reflux disease | 0.88               | 1.0                |   0.93617  | 22.0      |\n",
+      "| hypertension                    | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.92               |   0.958333 | 25.0      |\n",
+      "| jaundice                        | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| malaria                         | 1.0                | 1.0                |   1        | 23.0      |\n",
+      "| migraine                        | 1.0                | 0.8947368421052632 |   0.944444 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.8636363636363636 |   0.926829 | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.8333333333333334 |   0.909091 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.7727272727272727 |   0.871795 | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.625              |   0.769231 | 24.0      |\n",
+      "| urinary tract infection         | 0.9411764705882353 | 1.0                |   0.969697 | 16.0      |\n",
+      "| varicose veins                  | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.898268 |           |\n",
+      "| macro avg                       | 0.9094259814081346 | 0.9040145423533582 |   0.897398 |           |\n",
+      "| weighted avg                    | 0.9190699154565779 | 0.8982683982683982 |   0.899329 |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams(1, 2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***1-3***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 0.9985528219971056\n",
+      "Test Accuracy: 0.9025974025974026\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.95               | 1.0                |   0.974359 | 19.0      |\n",
+      "| allergy                         | 0.8333333333333334 | 0.7894736842105263 |   0.810811 | 19.0      |\n",
+      "| arthritis                       | 0.7777777777777778 | 1.0                |   0.875    | 14.0      |\n",
+      "| bronchial asthma                | 0.7083333333333334 | 1.0                |   0.829268 | 17.0      |\n",
+      "| cervical spondylosis            | 1.0                | 1.0                |   1        | 21.0      |\n",
+      "| chicken pox                     | 0.8421052631578947 | 0.8421052631578947 |   0.842105 | 19.0      |\n",
+      "| common cold                     | 0.7619047619047619 | 0.8888888888888888 |   0.820513 | 18.0      |\n",
+      "| dengue                          | 0.56               | 0.875              |   0.682927 | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.631578947368421  |   0.774194 | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| drug reaction                   | 0.9230769230769231 | 0.75               |   0.827586 | 16.0      |\n",
+      "| fungal infection                | 0.782608695652174  | 1.0                |   0.878049 | 18.0      |\n",
+      "| gastroesophageal reflux disease | 0.875              | 0.9545454545454546 |   0.913043 | 22.0      |\n",
+      "| hypertension                    | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.96               |   0.979592 | 25.0      |\n",
+      "| jaundice                        | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| malaria                         | 1.0                | 1.0                |   1        | 23.0      |\n",
+      "| migraine                        | 1.0                | 0.8947368421052632 |   0.944444 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.875              |   0.933333 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.6666666666666666 |   0.8      | 24.0      |\n",
+      "| urinary tract infection         | 0.9411764705882353 | 1.0                |   0.969697 | 16.0      |\n",
+      "| varicose veins                  | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.902597 |           |\n",
+      "| macro avg                       | 0.9126118741615444 | 0.9068483076377812 |   0.901162 |           |\n",
+      "| weighted avg                    | 0.9214411623430815 | 0.9025974025974026 |   0.90361  |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams(1, 3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***1-4***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 0.9985528219971056\n",
+      "Test Accuracy: 0.8982683982683982\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.95               | 1.0                |   0.974359 | 19.0      |\n",
+      "| allergy                         | 0.7894736842105263 | 0.7894736842105263 |   0.789474 | 19.0      |\n",
+      "| arthritis                       | 0.7777777777777778 | 1.0                |   0.875    | 14.0      |\n",
+      "| bronchial asthma                | 0.7083333333333334 | 1.0                |   0.829268 | 17.0      |\n",
+      "| cervical spondylosis            | 1.0                | 1.0                |   1        | 21.0      |\n",
+      "| chicken pox                     | 0.7894736842105263 | 0.7894736842105263 |   0.789474 | 19.0      |\n",
+      "| common cold                     | 0.7619047619047619 | 0.8888888888888888 |   0.820513 | 18.0      |\n",
+      "| dengue                          | 0.5416666666666666 | 0.8125             |   0.65     | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.7368421052631579 |   0.848485 | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| drug reaction                   | 0.9230769230769231 | 0.75               |   0.827586 | 16.0      |\n",
+      "| fungal infection                | 0.8571428571428571 | 1.0                |   0.923077 | 18.0      |\n",
+      "| gastroesophageal reflux disease | 0.875              | 0.9545454545454546 |   0.913043 | 22.0      |\n",
+      "| hypertension                    | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.96               |   0.979592 | 25.0      |\n",
+      "| jaundice                        | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| malaria                         | 1.0                | 1.0                |   1        | 23.0      |\n",
+      "| migraine                        | 1.0                | 0.8947368421052632 |   0.944444 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.7727272727272727 |   0.871795 | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.8333333333333334 |   0.909091 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.6666666666666666 |   0.8      | 24.0      |\n",
+      "| urinary tract infection         | 0.8888888888888888 | 1.0                |   0.941176 | 16.0      |\n",
+      "| varicose veins                  | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.898268 |           |\n",
+      "| macro avg                       | 0.9064396434462223 | 0.9028070729222044 |   0.897116 |           |\n",
+      "| weighted avg                    | 0.9158868387251846 | 0.8982683982683982 |   0.899595 |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams(1, 4)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***2-3***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 0.9971056439942113\n",
+      "Test Accuracy: 0.8831168831168831\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.9473684210526315 | 0.9473684210526315 |   0.947368 | 19.0      |\n",
+      "| allergy                         | 0.7                | 0.7368421052631579 |   0.717949 | 19.0      |\n",
+      "| arthritis                       | 0.8235294117647058 | 1.0                |   0.903226 | 14.0      |\n",
+      "| bronchial asthma                | 0.7083333333333334 | 1.0                |   0.829268 | 17.0      |\n",
+      "| cervical spondylosis            | 0.9545454545454546 | 1.0                |   0.976744 | 21.0      |\n",
+      "| chicken pox                     | 0.7727272727272727 | 0.8947368421052632 |   0.829268 | 19.0      |\n",
+      "| common cold                     | 0.7619047619047619 | 0.8888888888888888 |   0.820513 | 18.0      |\n",
+      "| dengue                          | 0.5416666666666666 | 0.8125             |   0.65     | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.631578947368421  |   0.774194 | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| drug reaction                   | 0.9166666666666666 | 0.6875             |   0.785714 | 16.0      |\n",
+      "| fungal infection                | 0.8571428571428571 | 1.0                |   0.923077 | 18.0      |\n",
+      "| gastroesophageal reflux disease | 1.0                | 0.8636363636363636 |   0.926829 | 22.0      |\n",
+      "| hypertension                    | 0.8571428571428571 | 1.0                |   0.923077 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.96               |   0.979592 | 25.0      |\n",
+      "| jaundice                        | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| malaria                         | 1.0                | 0.9565217391304348 |   0.977778 | 23.0      |\n",
+      "| migraine                        | 0.8947368421052632 | 0.8947368421052632 |   0.894737 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.7727272727272727 |   0.871795 | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.875              |   0.933333 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.625              |   0.769231 | 24.0      |\n",
+      "| urinary tract infection         | 0.7894736842105263 | 0.9375             |   0.857143 | 16.0      |\n",
+      "| varicose veins                  | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.883117 |           |\n",
+      "| macro avg                       | 0.8922552965896621 | 0.8876133016858132 |   0.880571 |           |\n",
+      "| weighted avg                    | 0.9031294439867679 | 0.8831168831168831 |   0.883735 |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams(2, 3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***2- N-Grams, stemming text***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***1-2***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 1.0\n",
+      "Test Accuracy: 0.8939393939393939\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.95               | 1.0                |   0.974359 | 19.0      |\n",
+      "| allergy                         | 0.8888888888888888 | 0.8421052631578947 |   0.864865 | 19.0      |\n",
+      "| arthritis                       | 0.8235294117647058 | 1.0                |   0.903226 | 14.0      |\n",
+      "| bronchial asthma                | 0.7727272727272727 | 1.0                |   0.871795 | 17.0      |\n",
+      "| cervical spondylosis            | 1.0                | 1.0                |   1        | 21.0      |\n",
+      "| chicken pox                     | 0.8333333333333334 | 0.7894736842105263 |   0.810811 | 19.0      |\n",
+      "| common cold                     | 0.7272727272727273 | 0.8888888888888888 |   0.8      | 18.0      |\n",
+      "| dengue                          | 0.5185185185185185 | 0.875              |   0.651163 | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.5789473684210527 |   0.733333 | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 0.8947368421052632 | 1.0                |   0.944444 | 17.0      |\n",
+      "| drug reaction                   | 0.8125             | 0.8125             |   0.8125   | 16.0      |\n",
+      "| fungal infection                | 0.75               | 1.0                |   0.857143 | 18.0      |\n",
+      "| gastroesophageal reflux disease | 0.88               | 1.0                |   0.93617  | 22.0      |\n",
+      "| hypertension                    | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.92               |   0.958333 | 25.0      |\n",
+      "| jaundice                        | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| malaria                         | 1.0                | 1.0                |   1        | 23.0      |\n",
+      "| migraine                        | 1.0                | 0.8947368421052632 |   0.944444 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.7727272727272727 |   0.871795 | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.8333333333333334 |   0.909091 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.7727272727272727 |   0.871795 | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.625              |   0.769231 | 24.0      |\n",
+      "| urinary tract infection         | 0.9411764705882353 | 1.0                |   0.969697 | 16.0      |\n",
+      "| varicose veins                  | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.893939 |           |\n",
+      "| macro avg                       | 0.9058354952604825 | 0.9002266635654793 |   0.892799 |           |\n",
+      "| weighted avg                    | 0.9155999663087571 | 0.8939393939393939 |   0.894412 |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams_stemming(1, 2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***1-3***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 1.0\n",
+      "Test Accuracy: 0.8939393939393939\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.95               | 1.0                |   0.974359 | 19.0      |\n",
+      "| allergy                         | 0.75               | 0.7894736842105263 |   0.769231 | 19.0      |\n",
+      "| arthritis                       | 0.7777777777777778 | 1.0                |   0.875    | 14.0      |\n",
+      "| bronchial asthma                | 0.7727272727272727 | 1.0                |   0.871795 | 17.0      |\n",
+      "| cervical spondylosis            | 1.0                | 1.0                |   1        | 21.0      |\n",
+      "| chicken pox                     | 0.8333333333333334 | 0.7894736842105263 |   0.810811 | 19.0      |\n",
+      "| common cold                     | 0.7272727272727273 | 0.8888888888888888 |   0.8      | 18.0      |\n",
+      "| dengue                          | 0.5185185185185185 | 0.875              |   0.651163 | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.5789473684210527 |   0.733333 | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| drug reaction                   | 1.0                | 0.75               |   0.857143 | 16.0      |\n",
+      "| fungal infection                | 0.75               | 1.0                |   0.857143 | 18.0      |\n",
+      "| gastroesophageal reflux disease | 0.875              | 0.9545454545454546 |   0.913043 | 22.0      |\n",
+      "| hypertension                    | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.92               |   0.958333 | 25.0      |\n",
+      "| jaundice                        | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| malaria                         | 1.0                | 1.0                |   1        | 23.0      |\n",
+      "| migraine                        | 1.0                | 0.8947368421052632 |   0.944444 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.875              |   0.933333 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.625              |   0.769231 | 24.0      |\n",
+      "| urinary tract infection         | 0.9411764705882353 | 1.0                |   0.969697 | 16.0      |\n",
+      "| varicose veins                  | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.893939 |           |\n",
+      "| macro avg                       | 0.9078174569047892 | 0.899059564947723  |   0.893019 |           |\n",
+      "| weighted avg                    | 0.9165861513197099 | 0.8939393939393939 |   0.895007 |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams_stemming(1, 3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***1-4***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 1.0\n",
+      "Test Accuracy: 0.8961038961038961\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.95               | 1.0                |   0.974359 | 19.0      |\n",
+      "| allergy                         | 0.75               | 0.7894736842105263 |   0.769231 | 19.0      |\n",
+      "| arthritis                       | 0.7777777777777778 | 1.0                |   0.875    | 14.0      |\n",
+      "| bronchial asthma                | 0.7727272727272727 | 1.0                |   0.871795 | 17.0      |\n",
+      "| cervical spondylosis            | 1.0                | 1.0                |   1        | 21.0      |\n",
+      "| chicken pox                     | 0.8333333333333334 | 0.7894736842105263 |   0.810811 | 19.0      |\n",
+      "| common cold                     | 0.7272727272727273 | 0.8888888888888888 |   0.8      | 18.0      |\n",
+      "| dengue                          | 0.5185185185185185 | 0.875              |   0.651163 | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.6842105263157895 |   0.8125   | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| drug reaction                   | 1.0                | 0.75               |   0.857143 | 16.0      |\n",
+      "| fungal infection                | 0.8181818181818182 | 1.0                |   0.9      | 18.0      |\n",
+      "| gastroesophageal reflux disease | 0.875              | 0.9545454545454546 |   0.913043 | 22.0      |\n",
+      "| hypertension                    | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.92               |   0.958333 | 25.0      |\n",
+      "| jaundice                        | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| malaria                         | 1.0                | 1.0                |   1        | 23.0      |\n",
+      "| migraine                        | 1.0                | 0.8947368421052632 |   0.944444 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.875              |   0.933333 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.5833333333333334 |   0.736842 | 24.0      |\n",
+      "| urinary tract infection         | 0.9411764705882353 | 1.0                |   0.969697 | 16.0      |\n",
+      "| varicose veins                  | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.896104 |           |\n",
+      "| macro avg                       | 0.9083435511808835 | 0.9017094187488924 |   0.895564 |           |\n",
+      "| weighted avg                    | 0.9171983337500741 | 0.8961038961038961 |   0.897198 |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams_stemming(1, 4)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ***2-3***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train Accuracy: 0.9971056439942113\n",
+      "Test Accuracy: 0.8766233766233766\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
+      "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
+      "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
+      "| acne                            | 0.9473684210526315 | 0.9473684210526315 |   0.947368 | 19.0      |\n",
+      "| allergy                         | 0.7142857142857143 | 0.7894736842105263 |   0.75     | 19.0      |\n",
+      "| arthritis                       | 0.8235294117647058 | 1.0                |   0.903226 | 14.0      |\n",
+      "| bronchial asthma                | 0.7083333333333334 | 1.0                |   0.829268 | 17.0      |\n",
+      "| cervical spondylosis            | 0.9545454545454546 | 1.0                |   0.976744 | 21.0      |\n",
+      "| chicken pox                     | 0.8333333333333334 | 0.7894736842105263 |   0.810811 | 19.0      |\n",
+      "| common cold                     | 0.7272727272727273 | 0.8888888888888888 |   0.8      | 18.0      |\n",
+      "| dengue                          | 0.5                | 0.875              |   0.636364 | 16.0      |\n",
+      "| diabetes                        | 1.0                | 0.5789473684210527 |   0.733333 | 19.0      |\n",
+      "| dimorphic hemorrhoids           | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| drug reaction                   | 0.9230769230769231 | 0.75               |   0.827586 | 16.0      |\n",
+      "| fungal infection                | 0.782608695652174  | 1.0                |   0.878049 | 18.0      |\n",
+      "| gastroesophageal reflux disease | 0.9047619047619048 | 0.8636363636363636 |   0.883721 | 22.0      |\n",
+      "| hypertension                    | 0.9                | 1.0                |   0.947368 | 18.0      |\n",
+      "| impetigo                        | 1.0                | 0.92               |   0.958333 | 25.0      |\n",
+      "| jaundice                        | 1.0                | 1.0                |   1        | 17.0      |\n",
+      "| malaria                         | 1.0                | 0.9565217391304348 |   0.977778 | 23.0      |\n",
+      "| migraine                        | 0.9444444444444444 | 0.8947368421052632 |   0.918919 | 19.0      |\n",
+      "| peptic ulcer disease            | 1.0                | 0.7272727272727273 |   0.842105 | 22.0      |\n",
+      "| pneumonia                       | 1.0                | 0.875              |   0.933333 | 24.0      |\n",
+      "| psoriasis                       | 1.0                | 0.8181818181818182 |   0.9      | 22.0      |\n",
+      "| typhoid                         | 1.0                | 0.5416666666666666 |   0.702703 | 24.0      |\n",
+      "| urinary tract infection         | 0.8421052631578947 | 1.0                |   0.914286 | 16.0      |\n",
+      "| varicose veins                  | 0.9444444444444444 | 1.0                |   0.971429 | 17.0      |\n",
+      "| accuracy                        |                    |                    |   0.876623 |           |\n",
+      "| macro avg                       | 0.8914397714820886 | 0.8840070084907042 |   0.87559  |           |\n",
+      "| weighted avg                    | 0.9017367812430991 | 0.8766233766233766 |   0.876885 |           |\n",
+      "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluate_n_grams_stemming(2, 3)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/conda_nlp_environment.yml
+++ b/conda_nlp_environment.yml
 name: NLP
 channels:
+  - conda-forge
  - defaults
 dependencies:
  - annotated-types=0.6.0=py39haa95532_0
@@ -15,10 +16,11 @@ dependencies:
  - blas=1.0=mkl
  - bleach=4.1.0=pyhd3eb1b0_0
  - bottleneck=1.3.7=py39h9128911_0
+  - brotli=1.0.9=ha925a31_2
  - brotli-python=1.0.9=py39hd77b12b_8
-  - ca-certificates=2024.9.24=haa95532_0
+  - ca-certificates=2024.8.30=h56e8100_0
  - catalogue=2.0.10=py39haa95532_0
-  - certifi=2024.8.30=py39haa95532_0
+  - certifi=2024.8.30=pyhd8ed1ab_0
  - cffi=1.17.1=py39h827c3e9_0
  - charset-normalizer=3.3.2=pyhd3eb1b0_0
  - click=8.1.7=py39haa95532_0
@@ -26,6 +28,8 @@ dependencies:
  - colorama=0.4.6=py39haa95532_0
  - comm=0.2.1=py39haa95532_0
  - confection=0.1.4=py39h9909e9c_0
+  - contourpy=1.2.0=py39h59b6b97_0
+  - cycler=0.12.1=pyhd8ed1ab_0
  - cymem=2.0.6=py39hd77b12b_0
  - cython-blis=0.7.9=py39h080aedc_0
  - debugpy=1.6.7=py39hd77b12b_0
@@ -33,6 +37,8 @@ dependencies:
  - defusedxml=0.7.1=pyhd3eb1b0_0
  - exceptiongroup=1.2.0=py39haa95532_0
  - executing=0.8.3=pyhd3eb1b0_0
+  - fonttools=4.25.0=pyhd3eb1b0_0
+  - freetype=2.10.4=h546665d_1
  - h11=0.14.0=py39haa95532_0
  - httpcore=1.0.2=py39haa95532_0
  - httpx=0.27.0=py39haa95532_0
@@ -40,6 +46,7 @@ dependencies:
  - idna=3.7=py39haa95532_0
  - importlib-metadata=7.0.1=py39haa95532_0
  - importlib_metadata=7.0.1=hd3eb1b0_0
+  - importlib_resources=6.4.5=pyhd8ed1ab_0
  - intel-openmp=2023.1.0=h59b6b97_46320
  - ipykernel=6.29.5=py39haa95532_0
  - ipython=8.15.0=py39haa95532_0
@@ -63,16 +70,23 @@ dependencies:
  - jupyterlab_pygments=0.1.2=py_0
  - jupyterlab_server=2.27.3=py39haa95532_0
  - jupyterlab_widgets=3.0.10=py39haa95532_0
+  - kiwisolver=1.4.4=py39hd77b12b_0
  - krb5=1.20.1=h5b6d351_0
  - langcodes=3.3.0=pyhd3eb1b0_0
+  - lcms2=2.12=h83e58a3_0
+  - lerc=3.0=hd77b12b_0
  - libclang=14.0.6=default_hb5a9fac_1
  - libclang13=14.0.6=default_h8e68704_1
+  - libdeflate=1.17=h2bbff1b_1
  - libpng=1.6.39=h8cc25b3_0
  - libpq=12.17=h906ac69_0
  - libsodium=1.0.18=h62dcd97_0
+  - libtiff=4.5.1=hd77b12b_0
+  - libwebp-base=1.3.2=h3d04722_1
  - lz4-c=1.9.4=h2bbff1b_1
  - markdown-it-py=2.2.0=py39haa95532_1
  - markupsafe=2.1.3=py39h2bbff1b_0
+  - matplotlib-base=3.9.2=py39he19b0ae_0
  - matplotlib-inline=0.1.6=py39haa95532_0
  - mdurl=0.1.0=py39haa95532_0
  - mistune=2.0.4=py39haa95532_0
@@ -80,6 +94,7 @@ dependencies:
  - mkl-service=2.4.0=py39h2bbff1b_1
  - mkl_fft=1.3.10=py39h827c3e9_0
  - mkl_random=1.2.7=py39hc64d2fc_0
+  - munkres=1.1.4=pyh9f0ad1d_0
  - murmurhash=1.0.7=py39hd77b12b_0
  - nbclient=0.8.0=py39haa95532_0
  - nbconvert=7.16.4=py39haa95532_0
@@ -91,6 +106,7 @@ dependencies:
  - numexpr=2.10.1=py39h4cd664f_0
  - numpy=1.26.4=py39h055cbcc_0
  - numpy-base=1.26.4=py39h65a83cf_0
+  - openjpeg=2.5.2=hae555c5_0
  - openssl=3.0.15=h827c3e9_0
  - overrides=7.4.0=py39haa95532_0
  - packaging=24.1=py39haa95532_0
@@ -98,6 +114,8 @@ dependencies:
  - pandocfilters=1.5.0=pyhd3eb1b0_0
  - parso=0.8.3=pyhd3eb1b0_0
  - pickleshare=0.7.5=pyhd3eb1b0_1003
+  - pillow=10.4.0=py39h827c3e9_0
+  - pip=24.3.1=pyh8b19718_0
  - platformdirs=3.10.0=py39haa95532_0
  - ply=3.11=py39haa95532_0
  - preshed=3.0.6=py39h6c2663c_0
@@ -110,6 +128,7 @@ dependencies:
  - pydantic=2.8.2=py39haa95532_0
  - pydantic-core=2.20.1=py39hefb1915_0
  - pygments=2.15.1=py39haa95532_1
+  - pyparsing=3.2.0=pyhd8ed1ab_1
  - pyqt=5.15.10=py39hd77b12b_0
  - pyqt5-sip=12.13.0=py39h2bbff1b_0
  - pysocks=1.7.1=py39haa95532_0
@@ -172,6 +191,7 @@ dependencies:
  - widgetsnbextension=4.0.10=py39haa95532_0
  - win_inet_pton=1.1.0=py39haa95532_0
  - winpty=0.4.3=4
+  - wordcloud=1.9.3=py39h2bbff1b_0
  - xz=5.4.6=h8cc25b3_1
  - yaml=0.2.5=he774522_0
  - zeromq=4.3.5=hd77b12b_0
@@ -184,7 +204,6 @@ dependencies:
      - emoji==2.14.0
      - gensim==4.3.3
      - huggingface-hub==0.26.2
-      - pip==24.3.1
      - pyahocorasick==2.1.0
      - safetensors==0.4.5
      - scikit-learn==1.5.2