Commit ba4e54ec authored by Almouhannad Hafez

(3) Add lemmatizer

parent d238e02f
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk import pos_tag\n",
"from nltk.corpus import stopwords\n",
"\n",
"import spacy\n",
"\n",
"import stanza\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"from tabulate import tabulate\n",
"\n",
"import pandas as pd\n",
"\n",
"from constants import CONSTANTS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Download necessary resources** \n",
"**Uncomment if you haven't already**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger_eng')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# stanza.download('en')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"nlp_spacy = spacy.load('en_core_web_sm')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-11-02 21:15:19 WARNING: Language en package default expects mwt, which has been added\n",
"2024-11-02 21:15:19 INFO: Loading these models for language: en (English):\n",
"=================================\n",
"| Processor | Package |\n",
"---------------------------------\n",
"| tokenize | combined |\n",
"| mwt | combined |\n",
"| pos | combined_charlm |\n",
"| lemma | combined_nocharlm |\n",
"=================================\n",
"\n",
"2024-11-02 21:15:19 INFO: Using device: cpu\n",
"2024-11-02 21:15:19 INFO: Loading: tokenize\n",
"2024-11-02 21:15:21 INFO: Loading: mwt\n",
"2024-11-02 21:15:21 INFO: Loading: pos\n",
"2024-11-02 21:15:22 INFO: Loading: lemma\n",
"2024-11-02 21:15:22 INFO: Done loading processors!\n"
]
}
],
"source": [
"# nlp_stanza = stanza.Pipeline('en', download_method=None)\n",
"nlp_stanza = stanza.Pipeline('en', processors='tokenize,pos,lemma', download_method=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***Some helper functions***"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"lemmatizer_nltk = WordNetLemmatizer()\n",
"\n",
"def get_wordnet_pos(treebank_tag):\n",
" if treebank_tag.startswith('J'):\n",
" return 'a' # Adjective\n",
" elif treebank_tag.startswith('V'):\n",
" return 'v' # Verb\n",
" elif treebank_tag.startswith('N'):\n",
" return 'n' # Noun\n",
" elif treebank_tag.startswith('R'):\n",
" return 'r' # Adverb\n",
" else:\n",
" return None\n",
"\n",
"def lemmatize_text_nltk(text):\n",
" tokens = word_tokenize(text)\n",
" pos_tags = pos_tag(tokens) # Get POS tags for the tokens\n",
" lemmatized_tokens = []\n",
" \n",
" for token, tag in pos_tags:\n",
" wordnet_pos = get_wordnet_pos(tag) # Convert to WordNet POS\n",
" if wordnet_pos: # If there's a valid POS tag\n",
" lemmatized_token = lemmatizer_nltk.lemmatize(token, pos=wordnet_pos)\n",
" else:\n",
" lemmatized_token = lemmatizer_nltk.lemmatize(token)\n",
" lemmatized_tokens.append(lemmatized_token)\n",
" \n",
" return ' '.join(lemmatized_tokens)"
]
},
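{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (illustrative sketch, not part of the evaluation; the words and sentence below are made up): `WordNetLemmatizer` defaults to treating every token as a noun, so passing the converted POS tag is what makes verb and adjective forms reduce correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Without a POS tag, WordNet assumes a noun and leaves 'running' unchanged\n",
"print(lemmatizer_nltk.lemmatize('running'))           # -> running\n",
"# With the verb tag, it reduces to the base form\n",
"print(lemmatizer_nltk.lemmatize('running', pos='v'))  # -> run\n",
"# The helper applies the tag conversion token by token\n",
"print(lemmatize_text_nltk('The patients were running high fevers'))"
]
},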
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def lemmatize_text_spacy(text):\n",
" doc = nlp_spacy(text) # Process the text with SpaCy\n",
" lemmatized_tokens = [token.lemma_ for token in doc] # Apply lemmatization\n",
" return ' '.join(lemmatized_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def lemmatize_text_stanza(text):\n",
" doc = nlp_stanza(text) # Process the text with Stanza\n",
" lemmatized_tokens = [word.lemma for sentence in doc.sentences for word in sentence.words] # Apply lemmatization\n",
" return ' '.join(lemmatized_tokens)"
]
},
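{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small side-by-side check of the three lemmatizers on one sample sentence (illustrative only; the sentence is made up, not drawn from the dataset)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample = 'I have been experiencing severe headaches and my eyes feel swollen'\n",
"for name, fn in [('NLTK', lemmatize_text_nltk),\n",
"                 ('spaCy', lemmatize_text_spacy),\n",
"                 ('Stanza', lemmatize_text_stanza)]:\n",
"    print(f'{name:>6}: {fn(sample)}')"
]
},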
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"stop_words = set(stopwords.words('english'))\n",
"\n",
"def remove_stopwords(text):\n",
" tokens = word_tokenize(text) # Tokenize the text\n",
" filtered_tokens = [word for word in tokens if word not in stop_words] # Remove stop words\n",
" return ' '.join(filtered_tokens)"
]
},
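{
"cell_type": "markdown",
"metadata": {},
"source": [
"Example of the stop-word filter on a made-up symptom description: function words such as 'I', 'have', and 'on' are dropped before vectorization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expected output: 'rash skin itchy'\n",
"print(remove_stopwords('I have a rash on my skin and it is itchy'))"
]
},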
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def print_clf_report_as_table(report):\n",
" data = []\n",
" for key, value in report.items():\n",
" if key != 'accuracy' and key != 'macro avg' and key != 'weighted avg':\n",
" data.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])\n",
"\n",
" data.append(['accuracy', '', '', report['accuracy'], ''])\n",
"\n",
" data.append(['macro avg', report['macro avg']['precision'], report['macro avg']['recall'], report['macro avg']['f1-score'], ''])\n",
"\n",
" data.append(['weighted avg', report['weighted avg']['precision'], report['weighted avg']['recall'], report['weighted avg']['f1-score'], ''])\n",
"\n",
" print(tabulate(data, headers=['Class', 'Precision', 'Recall', 'F1-score', 'Support'], tablefmt='psql'))"
]
},
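{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny usage example with hypothetical labels, just to show the table layout (the label names and predictions below are made up)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"toy_report = classification_report(\n",
"    ['flu', 'cold', 'flu', 'cold'],   # hypothetical true labels\n",
"    ['flu', 'cold', 'cold', 'cold'],  # hypothetical predictions\n",
"    output_dict=True)\n",
"print_clf_report_as_table(toy_report)"
]
},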
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def get_prepared_data():\n",
" train_set = pd.read_csv(CONSTANTS.TRAIN_SET_PATH)\n",
" X_train = train_set['text']\n",
" y_train = train_set['label']\n",
" test_set = pd.read_csv(CONSTANTS.TEST_SET_PATH)\n",
" X_test = test_set['text']\n",
" y_test = test_set['label'] \n",
" X_train = X_train.apply(remove_stopwords)\n",
" X_test = X_test.apply(remove_stopwords) \n",
" return X_train, X_test, y_train, y_test "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_method(lemmatizer):\n",
" # Read data\n",
" X_train, X_test, y_train, y_test = get_prepared_data()\n",
" # Apply lemmatization\n",
" X_train = X_train.apply(lemmatizer)\n",
" X_test = X_test.apply(lemmatizer)\n",
" # Apply TF_IDF\n",
" vectorizer = TfidfVectorizer()\n",
" X_train = vectorizer.fit_transform(X_train)\n",
" X_test = vectorizer.transform(X_test)\n",
" # Train Naive bayes classifier\n",
" classifier = MultinomialNB()\n",
" classifier.fit(X_train, y_train) \n",
" # Evaluate model\n",
" # training set\n",
" y_train_pred = classifier.predict(X_train)\n",
" train_accuracy = accuracy_score(y_train, y_train_pred)\n",
"\n",
" # test set\n",
" y_pred = classifier.predict(X_test)\n",
" test_accuracy = accuracy_score(y_test, y_pred)\n",
"\n",
" print(f'Train Accuracy: {train_accuracy}')\n",
" print(f'Test Accuracy: {test_accuracy}')\n",
" # Print classification report\n",
" report = classification_report(y_test, y_pred, output_dict=True)\n",
" print_clf_report_as_table(report) \n",
" "
]
},
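{
"cell_type": "markdown",
"metadata": {},
"source": [
"Stanza runs a full neural pipeline per document, which is slow on CPU when `evaluate_method` lemmatizes the whole train and test sets. A minimal optional sketch, assuming repeated runs over the same texts: memoize the Stanza helper with `functools.lru_cache` so each unique string is processed once. `lemmatize_text_stanza_cached` is a name introduced here for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functools import lru_cache\n",
"\n",
"@lru_cache(maxsize=None)\n",
"def lemmatize_text_stanza_cached(text):\n",
"    # Delegates to the Stanza helper; repeated inputs hit the in-memory cache\n",
"    return lemmatize_text_stanza(text)\n",
"\n",
"# Drop-in replacement for the uncached version:\n",
"# evaluate_method(lemmatize_text_stanza_cached)"
]
},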
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***1- Nltk lemmatizer***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note:** This lemmatizer uses POS tagging"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9942112879884226\n",
"Test Accuracy: 0.9242424242424242\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.9444444444444444 | 0.8947368421052632 | 0.918919 | 19.0 |\n",
"| arthritis | 0.875 | 1.0 | 0.933333 | 14.0 |\n",
"| bronchial asthma | 0.8095238095238095 | 1.0 | 0.894737 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.8421052631578947 | 0.8888888888888888 | 0.864865 | 18.0 |\n",
"| dengue | 0.5833333333333334 | 0.875 | 0.7 | 16.0 |\n",
"| diabetes | 1.0 | 0.6842105263157895 | 0.8125 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 0.8333333333333334 | 0.9375 | 0.882353 | 16.0 |\n",
"| fungal infection | 0.9 | 1.0 | 0.947368 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.8461538461538461 | 1.0 | 0.916667 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.9166666666666666 | 0.956522 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.7916666666666666 | 0.883721 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.924242 | |\n",
"| macro avg | 0.9294071772883692 | 0.9278851563884457 | 0.923453 | |\n",
"| weighted avg | 0.9363443369414178 | 0.9242424242424242 | 0.925164 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_method(lemmatize_text_nltk)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***2- Stanza lemmatizer***"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"evaluate_method(lemmatize_text_stanza)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***3- SpaCy lemmatizer***"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9956584659913169\n",
"Test Accuracy: 0.9285714285714286\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.7894736842105263 | 0.7894736842105263 | 0.789474 | 19.0 |\n",
"| arthritis | 0.875 | 1.0 | 0.933333 | 14.0 |\n",
"| bronchial asthma | 0.7391304347826086 | 1.0 | 0.85 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8823529411764706 | 0.7894736842105263 | 0.833333 | 19.0 |\n",
"| common cold | 0.8421052631578947 | 0.8888888888888888 | 0.864865 | 18.0 |\n",
"| dengue | 0.6086956521739131 | 0.875 | 0.717949 | 16.0 |\n",
"| diabetes | 1.0 | 0.7368421052631579 | 0.848485 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 1.0 | 0.9375 | 0.967742 | 16.0 |\n",
"| fungal infection | 1.0 | 1.0 | 1 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.8461538461538461 | 1.0 | 0.916667 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.9166666666666666 | 0.956522 | 24.0 |\n",
"| psoriasis | 1.0 | 0.9545454545454546 | 0.976744 | 22.0 |\n",
"| typhoid | 1.0 | 0.7916666666666666 | 0.883721 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.928571 | |\n",
"| macro avg | 0.9342273630540053 | 0.9313739921141236 | 0.928329 | |\n",
"| weighted avg | 0.9399432644351206 | 0.9285714285714286 | 0.929884 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_method(lemmatize_text_spacy)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -98,7 +98,6 @@ dependencies:
 - pandocfilters=1.5.0=pyhd3eb1b0_0
 - parso=0.8.3=pyhd3eb1b0_0
 - pickleshare=0.7.5=pyhd3eb1b0_1003
-- pip=24.2=py39haa95532_0
 - platformdirs=3.10.0=py39haa95532_0
 - ply=3.11=py39haa95532_0
 - preshed=3.0.6=py39h6c2663c_0
@@ -182,11 +181,14 @@ dependencies:
 - pip:
   - anyascii==0.3.2
   - contractions==0.1.73
+  - emoji==2.14.0
   - gensim==4.3.3
   - huggingface-hub==0.26.2
+  - pip==24.3.1
   - pyahocorasick==2.1.0
   - safetensors==0.4.5
   - scikit-learn==1.5.2
+  - stanza==1.9.2
   - textblob==0.18.0.post0
   - textsearch==0.0.24
   - threadpoolctl==3.5.0