almohanad.hafez / NLP-Project
Commit ba4e54ec, authored Nov 02, 2024 by Almouhannad Hafez
(3) Add lemmatizer
Parent: d238e02f

Showing 2 changed files, with 440 additions and 1 deletion:
3.1.Lemmatizer.ipynb       +437  -0
conda_nlp_environment.yml  +3    -1
3.1.Lemmatizer.ipynb (new file, mode 100644) @ ba4e54ec
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk import pos_tag\n",
"from nltk.corpus import stopwords\n",
"\n",
"import spacy\n",
"\n",
"import stanza\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"from tabulate import tabulate\n",
"\n",
"import pandas as pd\n",
"\n",
"from constants import CONSTANTS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Download necessary resources** \n",
"**Uncomment if you haven't already**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger_eng')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# stanza.download('en')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"nlp_spacy = spacy.load('en_core_web_sm')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-11-02 21:15:19 WARNING: Language en package default expects mwt, which has been added\n",
"2024-11-02 21:15:19 INFO: Loading these models for language: en (English):\n",
"=================================\n",
"| Processor | Package |\n",
"---------------------------------\n",
"| tokenize | combined |\n",
"| mwt | combined |\n",
"| pos | combined_charlm |\n",
"| lemma | combined_nocharlm |\n",
"=================================\n",
"\n",
"2024-11-02 21:15:19 INFO: Using device: cpu\n",
"2024-11-02 21:15:19 INFO: Loading: tokenize\n",
"2024-11-02 21:15:21 INFO: Loading: mwt\n",
"2024-11-02 21:15:21 INFO: Loading: pos\n",
"2024-11-02 21:15:22 INFO: Loading: lemma\n",
"2024-11-02 21:15:22 INFO: Done loading processors!\n"
]
}
],
"source": [
"# nlp_stanza = stanza.Pipeline('en', download_method=None)\n",
"nlp_stanza = stanza.Pipeline('en', processors='tokenize,pos,lemma', download_method=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***Some helper functions***"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"lemmatizer_nltk = WordNetLemmatizer()\n",
"\n",
"def get_wordnet_pos(treebank_tag):\n",
" if treebank_tag.startswith('J'):\n",
" return 'a' # Adjective\n",
" elif treebank_tag.startswith('V'):\n",
" return 'v' # Verb\n",
" elif treebank_tag.startswith('N'):\n",
" return 'n' # Noun\n",
" elif treebank_tag.startswith('R'):\n",
" return 'r' # Adverb\n",
" else:\n",
" return None\n",
"\n",
"def lemmatize_text_nltk(text):\n",
" tokens = word_tokenize(text)\n",
" pos_tags = pos_tag(tokens) # Get POS tags for the tokens\n",
" lemmatized_tokens = []\n",
" \n",
" for token, tag in pos_tags:\n",
" wordnet_pos = get_wordnet_pos(tag) # Convert to WordNet POS\n",
" if wordnet_pos: # If there's a valid POS tag\n",
" lemmatized_token = lemmatizer_nltk.lemmatize(token, pos=wordnet_pos)\n",
" else:\n",
" lemmatized_token = lemmatizer_nltk.lemmatize(token)\n",
" lemmatized_tokens.append(lemmatized_token)\n",
" \n",
" return ' '.join(lemmatized_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def lemmatize_text_spacy(text):\n",
" doc = nlp_spacy(text) # Process the text with SpaCy\n",
" lemmatized_tokens = [token.lemma_ for token in doc] # Apply lemmatization\n",
" return ' '.join(lemmatized_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def lemmatize_text_stanza(text):\n",
" doc = nlp_stanza(text) # Process the text with Stanza\n",
" lemmatized_tokens = [word.lemma for sentence in doc.sentences for word in sentence.words] # Apply lemmatization\n",
" return ' '.join(lemmatized_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"stop_words = set(stopwords.words('english'))\n",
"\n",
"def remove_stopwords(text):\n",
" tokens = word_tokenize(text) # Tokenize the text\n",
" filtered_tokens = [word for word in tokens if word not in stop_words] # Remove stop words\n",
" return ' '.join(filtered_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def print_clf_report_as_table(report):\n",
" data = []\n",
" for key, value in report.items():\n",
" if key != 'accuracy' and key != 'macro avg' and key != 'weighted avg':\n",
" data.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])\n",
"\n",
" data.append(['accuracy', '', '', report['accuracy'], ''])\n",
"\n",
" data.append(['macro avg', report['macro avg']['precision'], report['macro avg']['recall'], report['macro avg']['f1-score'], ''])\n",
"\n",
" data.append(['weighted avg', report['weighted avg']['precision'], report['weighted avg']['recall'], report['weighted avg']['f1-score'], ''])\n",
"\n",
" print(tabulate(data, headers=['Class', 'Precision', 'Recall', 'F1-score', 'Support'], tablefmt='psql'))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def get_prepared_data():\n",
" train_set = pd.read_csv(CONSTANTS.TRAIN_SET_PATH)\n",
" X_train = train_set['text']\n",
" y_train = train_set['label']\n",
" test_set = pd.read_csv(CONSTANTS.TEST_SET_PATH)\n",
" X_test = test_set['text']\n",
" y_test = test_set['label'] \n",
" X_train = X_train.apply(remove_stopwords)\n",
" X_test = X_test.apply(remove_stopwords) \n",
" return X_train, X_test, y_train, y_test "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_method(lemmatizer):\n",
" # Read data\n",
" X_train, X_test, y_train, y_test = get_prepared_data()\n",
" # Apply lemmatization\n",
" X_train = X_train.apply(lemmatizer)\n",
" X_test = X_test.apply(lemmatizer)\n",
" # Apply TF_IDF\n",
" vectorizer = TfidfVectorizer()\n",
" X_train = vectorizer.fit_transform(X_train)\n",
" X_test = vectorizer.transform(X_test)\n",
" # Train Naive bayes classifier\n",
" classifier = MultinomialNB()\n",
" classifier.fit(X_train, y_train) \n",
" # Evaluate model\n",
" # training set\n",
" y_train_pred = classifier.predict(X_train)\n",
" train_accuracy = accuracy_score(y_train, y_train_pred)\n",
"\n",
" # test set\n",
" y_pred = classifier.predict(X_test)\n",
" test_accuracy = accuracy_score(y_test, y_pred)\n",
"\n",
" print(f'Train Accuracy: {train_accuracy}')\n",
" print(f'Test Accuracy: {test_accuracy}')\n",
" # Print classification report\n",
" report = classification_report(y_test, y_pred, output_dict=True)\n",
" print_clf_report_as_table(report) \n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***1- Nltk lemmatizer***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note:** This lemmatizer uses POS tagging"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9942112879884226\n",
"Test Accuracy: 0.9242424242424242\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.9444444444444444 | 0.8947368421052632 | 0.918919 | 19.0 |\n",
"| arthritis | 0.875 | 1.0 | 0.933333 | 14.0 |\n",
"| bronchial asthma | 0.8095238095238095 | 1.0 | 0.894737 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.8421052631578947 | 0.8888888888888888 | 0.864865 | 18.0 |\n",
"| dengue | 0.5833333333333334 | 0.875 | 0.7 | 16.0 |\n",
"| diabetes | 1.0 | 0.6842105263157895 | 0.8125 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 0.8333333333333334 | 0.9375 | 0.882353 | 16.0 |\n",
"| fungal infection | 0.9 | 1.0 | 0.947368 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.8461538461538461 | 1.0 | 0.916667 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.9166666666666666 | 0.956522 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.7916666666666666 | 0.883721 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.924242 | |\n",
"| macro avg | 0.9294071772883692 | 0.9278851563884457 | 0.923453 | |\n",
"| weighted avg | 0.9363443369414178 | 0.9242424242424242 | 0.925164 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_method(lemmatize_text_nltk)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***2- Stanza lemmatizer***"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"evaluate_method(lemmatize_text_stanza)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***3- SpaCy lemmatizer***"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9956584659913169\n",
"Test Accuracy: 0.9285714285714286\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.7894736842105263 | 0.7894736842105263 | 0.789474 | 19.0 |\n",
"| arthritis | 0.875 | 1.0 | 0.933333 | 14.0 |\n",
"| bronchial asthma | 0.7391304347826086 | 1.0 | 0.85 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8823529411764706 | 0.7894736842105263 | 0.833333 | 19.0 |\n",
"| common cold | 0.8421052631578947 | 0.8888888888888888 | 0.864865 | 18.0 |\n",
"| dengue | 0.6086956521739131 | 0.875 | 0.717949 | 16.0 |\n",
"| diabetes | 1.0 | 0.7368421052631579 | 0.848485 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 1.0 | 0.9375 | 0.967742 | 16.0 |\n",
"| fungal infection | 1.0 | 1.0 | 1 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.8461538461538461 | 1.0 | 0.916667 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.9166666666666666 | 0.956522 | 24.0 |\n",
"| psoriasis | 1.0 | 0.9545454545454546 | 0.976744 | 22.0 |\n",
"| typhoid | 1.0 | 0.7916666666666666 | 0.883721 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.928571 | |\n",
"| macro avg | 0.9342273630540053 | 0.9313739921141236 | 0.928329 | |\n",
"| weighted avg | 0.9399432644351206 | 0.9285714285714286 | 0.929884 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_method(lemmatize_text_spacy)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
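The notebook runs each lemmatizer through the same evaluation pipeline (stop-word removal, lemmatization, TF-IDF, MultinomialNB). As a quick, hedged illustration of what the three helpers actually produce, the sketch below is not part of the commit; it assumes the notebook's setup cells have already run, so lemmatize_text_nltk, lemmatize_text_spacy and lemmatize_text_stanza are defined and their models loaded, and the sample sentence is made up for demonstration.

# Hypothetical comparison snippet, not part of the committed notebook.
sample = "The patients were coughing and had swollen feet"
print("nltk  :", lemmatize_text_nltk(sample))    # WordNet lemmas guided by Penn Treebank POS tags
print("spacy :", lemmatize_text_spacy(sample))   # lemmas from the en_core_web_sm pipeline
print("stanza:", lemmatize_text_stanza(sample))  # lemmas from the tokenize,pos,lemma processors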
conda_nlp_environment.yml @ ba4e54ec
...
@@ -98,7 +98,6 @@ dependencies:
   - pandocfilters=1.5.0=pyhd3eb1b0_0
   - parso=0.8.3=pyhd3eb1b0_0
   - pickleshare=0.7.5=pyhd3eb1b0_1003
-  - pip=24.2=py39haa95532_0
   - platformdirs=3.10.0=py39haa95532_0
   - ply=3.11=py39haa95532_0
   - preshed=3.0.6=py39h6c2663c_0
...
@@ -182,11 +181,14 @@ dependencies:
   - pip:
     - anyascii==0.3.2
     - contractions==0.1.73
+    - emoji==2.14.0
     - gensim==4.3.3
     - huggingface-hub==0.26.2
+    - pip==24.3.1
     - pyahocorasick==2.1.0
     - safetensors==0.4.5
     - scikit-learn==1.5.2
+    - stanza==1.9.2
     - textblob==0.18.0.post0
     - textsearch==0.0.24
     - threadpoolctl==3.5.0
...
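A small, hedged sanity check for the dependency changes (not part of the commit): assuming the environment described by conda_nlp_environment.yml is active, the snippet below confirms the packages touched by this diff resolve to the pinned versions.

import importlib.metadata as md  # standard library in Python 3.9; reads installed package versions

# Expected versions taken from the diff above: emoji 2.14.0, stanza 1.9.2, pip 24.3.1
for pkg, expected in (("emoji", "2.14.0"), ("stanza", "1.9.2"), ("pip", "24.3.1")):
    print(f"{pkg}: installed {md.version(pkg)}, expected {expected}")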