almohanad.hafez / NLP-Project
Commit ba4e54ec, authored Nov 02, 2024 by Almouhannad Hafez
(3) Add lemmatizer
Parent: d238e02f

Showing 2 changed files, with 440 additions and 1 deletion:
3.1.Lemmatizer.ipynb       +437  -0
conda_nlp_environment.yml  +3    -1
3.1.Lemmatizer.ipynb (new file, mode 100644) @ ba4e54ec
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk import pos_tag\n",
"from nltk.corpus import stopwords\n",
"\n",
"import spacy\n",
"\n",
"import stanza\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"from tabulate import tabulate\n",
"\n",
"import pandas as pd\n",
"\n",
"from constants import CONSTANTS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Download necessary resources** \n",
"**Uncomment if you haven't already**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger_eng')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# stanza.download('en')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"nlp_spacy = spacy.load('en_core_web_sm')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-11-02 21:15:19 WARNING: Language en package default expects mwt, which has been added\n",
"2024-11-02 21:15:19 INFO: Loading these models for language: en (English):\n",
"=================================\n",
"| Processor | Package |\n",
"---------------------------------\n",
"| tokenize | combined |\n",
"| mwt | combined |\n",
"| pos | combined_charlm |\n",
"| lemma | combined_nocharlm |\n",
"=================================\n",
"\n",
"2024-11-02 21:15:19 INFO: Using device: cpu\n",
"2024-11-02 21:15:19 INFO: Loading: tokenize\n",
"2024-11-02 21:15:21 INFO: Loading: mwt\n",
"2024-11-02 21:15:21 INFO: Loading: pos\n",
"2024-11-02 21:15:22 INFO: Loading: lemma\n",
"2024-11-02 21:15:22 INFO: Done loading processors!\n"
]
}
],
"source": [
"# nlp_stanza = stanza.Pipeline('en', download_method=None)\n",
"nlp_stanza = stanza.Pipeline('en', processors='tokenize,pos,lemma', download_method=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***Some helper functions***"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"lemmatizer_nltk = WordNetLemmatizer()\n",
"\n",
"def get_wordnet_pos(treebank_tag):\n",
" if treebank_tag.startswith('J'):\n",
" return 'a' # Adjective\n",
" elif treebank_tag.startswith('V'):\n",
" return 'v' # Verb\n",
" elif treebank_tag.startswith('N'):\n",
" return 'n' # Noun\n",
" elif treebank_tag.startswith('R'):\n",
" return 'r' # Adverb\n",
" else:\n",
" return None\n",
"\n",
"def lemmatize_text_nltk(text):\n",
" tokens = word_tokenize(text)\n",
" pos_tags = pos_tag(tokens) # Get POS tags for the tokens\n",
" lemmatized_tokens = []\n",
" \n",
" for token, tag in pos_tags:\n",
" wordnet_pos = get_wordnet_pos(tag) # Convert to WordNet POS\n",
" if wordnet_pos: # If there's a valid POS tag\n",
" lemmatized_token = lemmatizer_nltk.lemmatize(token, pos=wordnet_pos)\n",
" else:\n",
" lemmatized_token = lemmatizer_nltk.lemmatize(token)\n",
" lemmatized_tokens.append(lemmatized_token)\n",
" \n",
" return ' '.join(lemmatized_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def lemmatize_text_spacy(text):\n",
" doc = nlp_spacy(text) # Process the text with SpaCy\n",
" lemmatized_tokens = [token.lemma_ for token in doc] # Apply lemmatization\n",
" return ' '.join(lemmatized_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def lemmatize_text_stanza(text):\n",
" doc = nlp_stanza(text) # Process the text with Stanza\n",
" lemmatized_tokens = [word.lemma for sentence in doc.sentences for word in sentence.words] # Apply lemmatization\n",
" return ' '.join(lemmatized_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"stop_words = set(stopwords.words('english'))\n",
"\n",
"def remove_stopwords(text):\n",
" tokens = word_tokenize(text) # Tokenize the text\n",
" filtered_tokens = [word for word in tokens if word not in stop_words] # Remove stop words\n",
" return ' '.join(filtered_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def print_clf_report_as_table(report):\n",
" data = []\n",
" for key, value in report.items():\n",
" if key != 'accuracy' and key != 'macro avg' and key != 'weighted avg':\n",
" data.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])\n",
"\n",
" data.append(['accuracy', '', '', report['accuracy'], ''])\n",
"\n",
" data.append(['macro avg', report['macro avg']['precision'], report['macro avg']['recall'], report['macro avg']['f1-score'], ''])\n",
"\n",
" data.append(['weighted avg', report['weighted avg']['precision'], report['weighted avg']['recall'], report['weighted avg']['f1-score'], ''])\n",
"\n",
" print(tabulate(data, headers=['Class', 'Precision', 'Recall', 'F1-score', 'Support'], tablefmt='psql'))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def get_prepared_data():\n",
" train_set = pd.read_csv(CONSTANTS.TRAIN_SET_PATH)\n",
" X_train = train_set['text']\n",
" y_train = train_set['label']\n",
" test_set = pd.read_csv(CONSTANTS.TEST_SET_PATH)\n",
" X_test = test_set['text']\n",
" y_test = test_set['label'] \n",
" X_train = X_train.apply(remove_stopwords)\n",
" X_test = X_test.apply(remove_stopwords) \n",
" return X_train, X_test, y_train, y_test "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_method(lemmatizer):\n",
" # Read data\n",
" X_train, X_test, y_train, y_test = get_prepared_data()\n",
" # Apply lemmatization\n",
" X_train = X_train.apply(lemmatizer)\n",
" X_test = X_test.apply(lemmatizer)\n",
" # Apply TF_IDF\n",
" vectorizer = TfidfVectorizer()\n",
" X_train = vectorizer.fit_transform(X_train)\n",
" X_test = vectorizer.transform(X_test)\n",
" # Train Naive bayes classifier\n",
" classifier = MultinomialNB()\n",
" classifier.fit(X_train, y_train) \n",
" # Evaluate model\n",
" # training set\n",
" y_train_pred = classifier.predict(X_train)\n",
" train_accuracy = accuracy_score(y_train, y_train_pred)\n",
"\n",
" # test set\n",
" y_pred = classifier.predict(X_test)\n",
" test_accuracy = accuracy_score(y_test, y_pred)\n",
"\n",
" print(f'Train Accuracy: {train_accuracy}')\n",
" print(f'Test Accuracy: {test_accuracy}')\n",
" # Print classification report\n",
" report = classification_report(y_test, y_pred, output_dict=True)\n",
" print_clf_report_as_table(report) \n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***1- Nltk lemmatizer***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note:** This lemmatizer uses POS tagging"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9942112879884226\n",
"Test Accuracy: 0.9242424242424242\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.9444444444444444 | 0.8947368421052632 | 0.918919 | 19.0 |\n",
"| arthritis | 0.875 | 1.0 | 0.933333 | 14.0 |\n",
"| bronchial asthma | 0.8095238095238095 | 1.0 | 0.894737 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.8421052631578947 | 0.8888888888888888 | 0.864865 | 18.0 |\n",
"| dengue | 0.5833333333333334 | 0.875 | 0.7 | 16.0 |\n",
"| diabetes | 1.0 | 0.6842105263157895 | 0.8125 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 0.8333333333333334 | 0.9375 | 0.882353 | 16.0 |\n",
"| fungal infection | 0.9 | 1.0 | 0.947368 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.8461538461538461 | 1.0 | 0.916667 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.9166666666666666 | 0.956522 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.7916666666666666 | 0.883721 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.924242 | |\n",
"| macro avg | 0.9294071772883692 | 0.9278851563884457 | 0.923453 | |\n",
"| weighted avg | 0.9363443369414178 | 0.9242424242424242 | 0.925164 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_method(lemmatize_text_nltk)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***2- Stanza lemmatizer***"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"evaluate_method(lemmatize_text_stanza)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***3- SpaCy lemmatizer***"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9956584659913169\n",
"Test Accuracy: 0.9285714285714286\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.7894736842105263 | 0.7894736842105263 | 0.789474 | 19.0 |\n",
"| arthritis | 0.875 | 1.0 | 0.933333 | 14.0 |\n",
"| bronchial asthma | 0.7391304347826086 | 1.0 | 0.85 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8823529411764706 | 0.7894736842105263 | 0.833333 | 19.0 |\n",
"| common cold | 0.8421052631578947 | 0.8888888888888888 | 0.864865 | 18.0 |\n",
"| dengue | 0.6086956521739131 | 0.875 | 0.717949 | 16.0 |\n",
"| diabetes | 1.0 | 0.7368421052631579 | 0.848485 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 1.0 | 0.9375 | 0.967742 | 16.0 |\n",
"| fungal infection | 1.0 | 1.0 | 1 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.8461538461538461 | 1.0 | 0.916667 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.9166666666666666 | 0.956522 | 24.0 |\n",
"| psoriasis | 1.0 | 0.9545454545454546 | 0.976744 | 22.0 |\n",
"| typhoid | 1.0 | 0.7916666666666666 | 0.883721 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.928571 | |\n",
"| macro avg | 0.9342273630540053 | 0.9313739921141236 | 0.928329 | |\n",
"| weighted avg | 0.9399432644351206 | 0.9285714285714286 | 0.929884 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_method(lemmatize_text_spacy)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
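The notebook runs each lemmatizer through the same evaluation pipeline (stop-word removal, lemmatization, TF-IDF, MultinomialNB). As a quick, hedged illustration of what the three helpers actually produce, the sketch below is not part of the commit; it assumes the notebook's setup cells have already run, so lemmatize_text_nltk, lemmatize_text_spacy and lemmatize_text_stanza are defined and their models loaded, and the sample sentence is made up for demonstration.

# Hypothetical comparison snippet, not part of the committed notebook.
sample = "The patients were coughing and had swollen feet"
print("nltk  :", lemmatize_text_nltk(sample))    # WordNet lemmas guided by Penn Treebank POS tags
print("spacy :", lemmatize_text_spacy(sample))   # lemmas from the en_core_web_sm pipeline
print("stanza:", lemmatize_text_stanza(sample))  # lemmas from the tokenize,pos,lemma processors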
conda_nlp_environment.yml @ ba4e54ec
...
@@ -98,7 +98,6 @@ dependencies:
   - pandocfilters=1.5.0=pyhd3eb1b0_0
   - parso=0.8.3=pyhd3eb1b0_0
   - pickleshare=0.7.5=pyhd3eb1b0_1003
-  - pip=24.2=py39haa95532_0
   - platformdirs=3.10.0=py39haa95532_0
   - ply=3.11=py39haa95532_0
   - preshed=3.0.6=py39h6c2663c_0
...
@@ -182,11 +181,14 @@ dependencies:
   - pip:
     - anyascii==0.3.2
     - contractions==0.1.73
+    - emoji==2.14.0
     - gensim==4.3.3
     - huggingface-hub==0.26.2
+    - pip==24.3.1
     - pyahocorasick==2.1.0
     - safetensors==0.4.5
     - scikit-learn==1.5.2
+    - stanza==1.9.2
     - textblob==0.18.0.post0
     - textsearch==0.0.24
     - threadpoolctl==3.5.0
...
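A small, hedged sanity check for the dependency changes (not part of the commit): assuming the environment described by conda_nlp_environment.yml is active, the snippet below confirms the packages touched by this diff resolve to the pinned versions.

import importlib.metadata as md  # standard library in Python 3.9; reads installed package versions

# Expected versions taken from the diff above: emoji 2.14.0, stanza 1.9.2, pip 24.3.1
for pkg, expected in (("emoji", "2.14.0"), ("stanza", "1.9.2"), ("pip", "24.3.1")):
    print(f"{pkg}: installed {md.version(pkg)}, expected {expected}")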