Commit 6cc41f04 authored by Almouhannad Hafez's avatar Almouhannad Hafez

(6) Add synsets with ngrams and parsing tree atts

parent e46bb1ef
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"from nltk.corpus import wordnet as wn\n",
"\n",
"import numpy as np\n",
"\n",
"import pandas as pd\n",
"\n",
"import pickle\n",
"\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"from tabulate import tabulate"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warming up PyWSD (takes ~10 secs)... took 12.379048109054565 secs.\n"
]
}
],
"source": [
"from pywsd.lesk import simple_lesk"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"parent_dir = os.path.abspath('..')\n",
"sys.path.append(parent_dir)\n",
"from constants import CONSTANTS\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"train_df = pd.read_csv(CONSTANTS.AUGMENTED_TRAIN_SET_PATH)\n",
"test_df = pd.read_csv(CONSTANTS.AUGMENTED_TEST_SET_PATH)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***Load dep. parsing results***\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\TOSHIBA\\AppData\\Roaming\\Python\\Python39\\site-packages\\networkx\\utils\\backends.py:135: RuntimeWarning: networkx backend defined more than once: nx-loopback\n",
" backends.update(_get_backends(\"networkx.backends\"))\n"
]
}
],
"source": [
"with open(CONSTANTS.DEP_PARSED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
" loaded_data = pickle.load(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***Helper functions***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***Get processed text by row id***"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Each sentence in the dataset has an id; the matching document holds its Stanza processing\n",
"def get_doc_by_id(target_id):\n",
"    \"\"\"Return the processed document stored under `target_id`.\n",
"\n",
"    `loaded_data` is scanned in order and the `processed_text` of the first\n",
"    entry whose `id` equals `target_id` is returned; `None` means no match.\n",
"    \"\"\"\n",
"    matches = (obj[\"processed_text\"] for obj in loaded_data if obj[\"id\"] == target_id)\n",
"    return next(matches, None)\n",
"\n",
"def get_text_tokens(text_id):\n",
"    \"\"\"Return the text of every word of document `text_id`, sentence by sentence.\"\"\"\n",
"    doc = get_doc_by_id(text_id)\n",
"    tokens = []\n",
"    for sentence in doc.sentences:\n",
"        tokens.extend(word.text for word in sentence.words)\n",
"    return tokens\n",
"\n",
"def get_wsd_synsets(text_id):\n",
"    \"\"\"Return the space-separated synset names chosen for the words of `text_id`.\n",
"\n",
"    Each word is disambiguated with `simple_lesk`, using the document's full\n",
"    text as context; words for which no synset is returned are skipped.\n",
"    \"\"\"\n",
"    tokens = get_text_tokens(text_id)\n",
"    context = get_doc_by_id(text_id).text\n",
"    # Disambiguate every token against the whole text, lazily\n",
"    senses = (simple_lesk(context, token) for token in tokens)\n",
"    return \" \".join(sense.name() for sense in senses if sense)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***Features extraction***\n",
"**Dependency Relation Tuples:** \n",
"- `(head_word, dependent_word, dependency_relation)`\n",
"- `n1 -> n2 grams`"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def features_extraction(row_id, n1=1, n2=2):\n",
"    \"\"\"Build the flat feature list for the document identified by `row_id`.\n",
"\n",
"    The result combines, in this order:\n",
"    - de-duplicated `(head lemma, dependent lemma, deprel)` tuples and lemma\n",
"      n-grams of every size from `n1` to `n2` (inclusive), each flattened into\n",
"      a single string by joining its parts with `_`;\n",
"    - the synset names returned by `get_wsd_synsets` (not de-duplicated).\n",
"    \"\"\"\n",
"    parsed = get_doc_by_id(row_id)\n",
"    unique_features = set()  # a set, so repeated tuples/n-grams count once\n",
"\n",
"    # Pass 1: dependency relation tuples (root words have head == 0 and are skipped)\n",
"    for sent in parsed.sentences:\n",
"        unique_features.update(\n",
"            (sent.words[tok.head - 1].lemma, tok.lemma, tok.deprel)  # head index is 1-based\n",
"            for tok in sent.words\n",
"            if tok.head > 0\n",
"        )\n",
"\n",
"    # Pass 2: lemma n-grams for every size in [n1, n2]\n",
"    for sent in parsed.sentences:\n",
"        lemmas = [tok.lemma for tok in sent.words]\n",
"        for size in range(n1, n2 + 1):\n",
"            for start in range(len(lemmas) - size + 1):\n",
"                unique_features.add(tuple(lemmas[start:start + size]))\n",
"\n",
"    features = ['_'.join(str(part) for part in feature) for feature in unique_features]\n",
"    features.extend(get_wsd_synsets(row_id).split())\n",
"    return features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***Features selection***\n",
"- Using `SelectKBest`"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def select_features(X_train, X_test, y_train, scorer, k_value):\n",
"    \"\"\"Keep the `k_value` highest-scoring features according to `scorer`.\n",
"\n",
"    The selector is fitted on the training data only and then applied to both\n",
"    matrices; returns `(X_train_selected, X_test_selected)`.\n",
"    \"\"\"\n",
"    k_best = SelectKBest(score_func=scorer, k=k_value).fit(X_train, y_train)\n",
"    return k_best.transform(X_train), k_best.transform(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***Plot train and test accuracies vs. number of features***\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def plot_accuracies(X_train, X_test, y_train, y_test, model, k_start=500, k_end=4500, step=250, highlight_width=10):\n",
"    \"\"\"Plot train/test accuracy of `model` against the number of chi2-selected features.\n",
"\n",
"    For every k in `range(k_start, k_end + 1, step)` the top-k features are chosen with\n",
"    `select_features(..., chi2, k)`, `model` is refit on them and its accuracy on both\n",
"    sets is recorded. The (up to) three k values with the smallest |train - test| gap\n",
"    are printed and highlighted on the plot.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X_train, X_test : feature matrices handed to `select_features`.\n",
"    y_train, y_test : labels used for fitting and scoring.\n",
"    model : estimator exposing `fit` and `predict`; it is refit for every k.\n",
"    k_start, k_end, step : sweep over the number of selected features (`k_end` inclusive).\n",
"    highlight_width : width, in x-axis units, of the highlight rectangles. The default\n",
"        of 10 reproduces the previously hard-coded size; pass something proportional\n",
"        to `step` (e.g. `step / 2`) for wide sweeps, otherwise the rectangles are\n",
"        too thin to see.\n",
"    \"\"\"\n",
"    features_counts = list(range(k_start, k_end + 1, step))\n",
"    train_accuracies = []\n",
"    test_accuracies = []\n",
"    for k in features_counts:\n",
"        X_train_selected, X_test_selected = select_features(X_train, X_test, y_train, chi2, k)\n",
"        model.fit(X_train_selected, y_train)\n",
"        train_accuracies.append(accuracy_score(y_train, model.predict(X_train_selected)))\n",
"        test_accuracies.append(accuracy_score(y_test, model.predict(X_test_selected)))\n",
"\n",
"    # Plotting the accuracies\n",
"    plt.figure(figsize=(10, 6))\n",
"    plt.plot(features_counts, train_accuracies, label='Train Accuracy', marker='.')\n",
"    plt.plot(features_counts, test_accuracies, label='Test Accuracy', marker='.')\n",
"    plt.title('Train and Test Accuracy vs Number of Features')\n",
"    plt.xlabel('Number of Features Selected')\n",
"    plt.ylabel('Accuracy')\n",
"    plt.legend()\n",
"    plt.grid()\n",
"\n",
"    # The three sweep points where train and test accuracy are closest\n",
"    gaps = np.abs(np.array(train_accuracies) - np.array(test_accuracies))\n",
"    closest_indices = np.argsort(gaps)[:3]\n",
"    colors = ['darkgreen', 'mediumseagreen', 'lightgreen']\n",
"\n",
"    # Shade the train/test gap at each of those points, centred on its k\n",
"    for rank, idx in enumerate(closest_indices):\n",
"        y_bottom = min(train_accuracies[idx], test_accuracies[idx])\n",
"        y_top = max(train_accuracies[idx], test_accuracies[idx])\n",
"\n",
"        plt.gca().add_patch(plt.Rectangle(\n",
"            (features_counts[idx] - highlight_width / 2, y_bottom), highlight_width, y_top - y_bottom,\n",
"            color=colors[rank], alpha=0.5\n",
"        ))\n",
"\n",
"        # Print the number of selected features for each closest point\n",
"        print(f\"Closest Point {rank+1}: Number of Features = {features_counts[idx]}, Train Accuracy = {train_accuracies[idx]}, Test Accuracy = {test_accuracies[idx]}\")\n",
"\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***Evaluate model***\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def print_clf_report_as_table(report):\n",
"    \"\"\"Print a `classification_report(..., output_dict=True)` dict as a psql-style table.\n",
"\n",
"    Per-class rows come first (in the dict's own order), followed by the accuracy,\n",
"    macro-average and weighted-average summary rows.\n",
"    \"\"\"\n",
"    summary_keys = ('accuracy', 'macro avg', 'weighted avg')\n",
"    metrics = ('precision', 'recall', 'f1-score')\n",
"\n",
"    rows = [\n",
"        [label, *(scores[metric] for metric in metrics), scores['support']]\n",
"        for label, scores in report.items()\n",
"        if label not in summary_keys\n",
"    ]\n",
"    rows.append(['accuracy', '', '', report['accuracy'], ''])\n",
"    for average in ('macro avg', 'weighted avg'):\n",
"        rows.append([average, *(report[average][metric] for metric in metrics), ''])\n",
"\n",
"    print(tabulate(rows, headers=['Class', 'Precision', 'Recall', 'F1-score', 'Support'], tablefmt='psql'))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_model(X_train, X_test, y_train, y_test, scorer, k_value, model):\n",
"    \"\"\"Fit `model` on the `k_value` best features and print its train/test metrics.\n",
"\n",
"    Prints the train accuracy, the test accuracy, their difference, and the\n",
"    per-class classification report for the test set as a table.\n",
"    \"\"\"\n",
"    train_selected, test_selected = select_features(X_train, X_test, y_train, scorer, k_value)\n",
"    model.fit(train_selected, y_train)\n",
"\n",
"    train_accuracy = accuracy_score(y_train, model.predict(train_selected))\n",
"    test_predictions = model.predict(test_selected)\n",
"    test_accuracy = accuracy_score(y_test, test_predictions)\n",
"\n",
"    summary_lines = (\n",
"        f'Train Accuracy: {train_accuracy}',\n",
"        f'Test Accuracy: {test_accuracy}',\n",
"        f'Difference: {train_accuracy-test_accuracy}',\n",
"    )\n",
"    for line in summary_lines:\n",
"        print(line)\n",
"\n",
"    # Per-class breakdown on the test set\n",
"    print_clf_report_as_table(classification_report(y_test, test_predictions, output_dict=True))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Use Dependency features with N-Grams and synsets with WSD***\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# N-gram sizes shared by the train and test feature extraction.\n",
"n1 = 1\n",
"n2 = 3\n",
"\n",
"\n",
"def extractor(row_id):\n",
"    \"\"\"Extract features for `row_id` using the notebook-level `n1`..`n2` n-gram range.\"\"\"\n",
"    return features_extraction(row_id, n1, n2)\n",
"\n",
"\n",
"# Both splits must go through `extractor`. Calling `features_extraction` directly\n",
"# falls back to its default n2=2, so the training set would contain no 3-grams while\n",
"# the test set does, and the intended (1,3)-gram configuration would never be trained.\n",
"# NOTE(review): the outputs saved in this notebook predate this fix; re-run to refresh them.\n",
"train_df[\"features\"] = train_df[\"Id\"].apply(extractor)\n",
"test_df[\"features\"] = test_df[\"Id\"].apply(extractor)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train set shape after features extraction: (4320, 47225)\n",
"Test set shape after features extraction: (480, 47225)\n"
]
}
],
"source": [
"\n",
"# Each row's feature list is joined with spaces so it can be fed to the vectorizer.\n",
"all_features_flat_train = [' '.join(features) for features in train_df[\"features\"]]\n",
"all_features_flat_test = [' '.join(features) for features in test_df[\"features\"]]\n",
"\n",
"# Tokenise on whitespace only. TfidfVectorizer's default token_pattern keeps only runs\n",
"# of two or more word characters, which splits a synset name such as 'pain.n.01' into\n",
"# 'pain' and '01' and so throws away the sense chosen by WSD.\n",
"# NOTE(review): this changes the vocabulary, so the shapes and scores saved in this\n",
"# notebook must be regenerated.\n",
"vectorizer = TfidfVectorizer(token_pattern=r\"\\S+\")\n",
"X_train = vectorizer.fit_transform(all_features_flat_train)\n",
"X_test = vectorizer.transform(all_features_flat_test)\n",
"print(f\"Train set shape after features extraction: {X_train.shape}\")\n",
"print(f\"Test set shape after features extraction: {X_test.shape}\")\n",
"\n",
"y_train = train_df[\"label\"]\n",
"y_test = test_df[\"label\"]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Closest Point 1: Number of Features = 500, Train Accuracy = 0.9414351851851852, Test Accuracy = 0.9229166666666667\n",
"Closest Point 2: Number of Features = 5500, Train Accuracy = 0.9962962962962963, Test Accuracy = 0.9708333333333333\n",
"Closest Point 3: Number of Features = 2500, Train Accuracy = 0.9912037037037037, Test Accuracy = 0.9625\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Make sure `step` suits the size of the feature space before running this cell.\n",
"# The upper bound is taken from X_train instead of being hard-coded, so the sweep never\n",
"# asks SelectKBest for more features than exist if the extracted vocabulary changes.\n",
"# For the 47225-feature matrix reported above this yields the same k values as k_end=47000.\n",
"model = MultinomialNB(alpha=0.01)\n",
"plot_accuracies(X_train, X_test, y_train, y_test, model, k_end=X_train.shape[1], step=1000)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9962962962962963\n",
"Test Accuracy: 0.9708333333333333\n",
"Difference: 0.02546296296296302\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| Acne | 1.0 | 1.0 | 1 | 21.0 |\n",
"| Arthritis | 1.0 | 1.0 | 1 | 20.0 |\n",
"| Bronchial Asthma | 1.0 | 0.9473684210526315 | 0.972973 | 19.0 |\n",
"| Cervical spondylosis | 0.9545454545454546 | 1.0 | 0.976744 | 21.0 |\n",
"| Chicken pox | 0.7142857142857143 | 1.0 | 0.833333 | 15.0 |\n",
"| Common Cold | 1.0 | 1.0 | 1 | 21.0 |\n",
"| Dengue | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| Dimorphic Hemorrhoids | 1.0 | 1.0 | 1 | 19.0 |\n",
"| Fungal infection | 1.0 | 1.0 | 1 | 26.0 |\n",
"| Hypertension | 1.0 | 1.0 | 1 | 18.0 |\n",
"| Impetigo | 1.0 | 1.0 | 1 | 23.0 |\n",
"| Jaundice | 1.0 | 1.0 | 1 | 22.0 |\n",
"| Malaria | 1.0 | 1.0 | 1 | 17.0 |\n",
"| Migraine | 1.0 | 1.0 | 1 | 24.0 |\n",
"| Pneumonia | 1.0 | 1.0 | 1 | 22.0 |\n",
"| Psoriasis | 1.0 | 0.8235294117647058 | 0.903226 | 17.0 |\n",
"| Typhoid | 1.0 | 1.0 | 1 | 18.0 |\n",
"| Varicose Veins | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| allergy | 0.7894736842105263 | 1.0 | 0.882353 | 15.0 |\n",
"| diabetes | 1.0 | 0.9411764705882353 | 0.969697 | 17.0 |\n",
"| drug reaction | 1.0 | 1.0 | 1 | 16.0 |\n",
"| gastroesophageal reflux disease | 0.95 | 0.9047619047619048 | 0.926829 | 21.0 |\n",
"| peptic ulcer disease | 0.9444444444444444 | 0.9444444444444444 | 0.944444 | 18.0 |\n",
"| urinary tract infection | 0.9583333333333334 | 1.0 | 0.978723 | 23.0 |\n",
"| accuracy | | | 0.970833 | |\n",
"| macro avg | 0.9712951096174781 | 0.9705836635557997 | 0.968321 | |\n",
"| weighted avg | 0.9762364837282601 | 0.9708333333333333 | 0.971284 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"# Feature count taken from the sweep output above (one of the reported closest points).\n",
"best_k = 5500\n",
"model = MultinomialNB(alpha=0.01)\n",
"evaluate_model(X_train, X_test, y_train, y_test, scorer=chi2, k_value=best_k, model=model)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "NLP",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
...@@ -108,26 +108,28 @@ ...@@ -108,26 +108,28 @@
> ***Using augmented dataset*** > ***Using augmented dataset***
| Case\\Criterion | Accuracy(Train) | Accuracy(Test) | Difference(%) | Precision(Test-Average) | Recall(Test-Average) | F1-Score(Test-Average) | Notes | | Case\\Criterion | Accuracy(Train) | Accuracy(Test) | Difference(%) | Precision(Test-Average) | Recall(Test-Average) | F1-Score(Test-Average) | Notes |
| -------------------------------------------------- | --------------- | -------------- | ------------- | ----------------------- | -------------------- | ---------------------- | ------------------------- | | ------------------------------------------------------------ | --------------- | -------------- | ------------- | ----------------------- | -------------------- | ---------------------- | ------------------------- |
| nltk stemmer | 0.9852 | 0.9604 | 2.5 | 0.9593 | 0.9587 | 0.9574 | alpha=0.1, 450features | | nltk stemmer | 0.9852 | 0.9604 | 2.5 | 0.9593 | 0.9587 | 0.9574 | alpha=0.1, 450features |
| nltk lemmatizer | 0.9891 | 0.9625 | 2.7 | 0.9635 | 0.9626 | 0.9608 | alpha=0.1, 700features | | nltk lemmatizer | 0.9891 | 0.9625 | 2.7 | 0.9635 | 0.9626 | 0.9608 | alpha=0.1, 700features |
| Stanza lemmatizer | 0.9843 | 0.9646 | 2.0 | 0.9652 | 0.9642 | 0.9623 | alpha=0.1, 550features | | Stanza lemmatizer | 0.9843 | 0.9646 | 2.0 | 0.9652 | 0.9642 | 0.9623 | alpha=0.1, 550features |
| SpaCy lemmatizer | 0.9657 | 0.9563 | 0.9 | 0.9582 | 0.9550 | 0.9526 | alpha=0.1, 300features | | SpaCy lemmatizer | 0.9657 | 0.9563 | 0.9 | 0.9582 | 0.9550 | 0.9526 | alpha=0.1, 300features |
| Lemma + Verbs only | 0.7229 | 0.6438 | 7.9 | 0.6675 | 0.6400 | 0.6341 | alpha=0.1, 350features | | Lemma + Verbs only | 0.7229 | 0.6438 | 7.9 | 0.6675 | 0.6400 | 0.6341 | alpha=0.1, 350features |
| Lemma + Adjectives only | 0.8037 | 0.6250 | 17.9 | 0.6531 | 0.6128 | 0.6057 | alpha=0.1, 450features | | Lemma + Adjectives only | 0.8037 | 0.6250 | 17.9 | 0.6531 | 0.6128 | 0.6057 | alpha=0.1, 450features |
| Lemma + Nouns only | 0.9766 | 0.9229 | 5.4 | 0.9230 | 0.9204 | 0.9175 | alpha=0.1, 850features | | Lemma + Nouns only | 0.9766 | 0.9229 | 5.4 | 0.9230 | 0.9204 | 0.9175 | alpha=0.1, 850features |
| Text + (1,2)Gram | 0.9958 | 0.9688 | 2.7 | 0.9679 | 0.9681 | 0.9662 | alpha=0.01, 3100features | | Text + (1,2)Gram | 0.9958 | 0.9688 | 2.7 | 0.9679 | 0.9681 | 0.9662 | alpha=0.01, 3100features |
| Text + (1,3)Gram | 0.9977 | 0.9708 | 2.7 | 0.9709 | 0.9704 | 0.9677 | alpha=0.01, 9600features | | Text + (1,3)Gram | 0.9977 | 0.9708 | 2.7 | 0.9709 | 0.9704 | 0.9677 | alpha=0.01, 9600features |
| Text + (1,4)Gram | 0.9956 | 0.9667 | 2.9 | 0.9671 | 0.9660 | 0.9631 | alpha=0.01, 8600features | | Text + (1,4)Gram | 0.9956 | 0.9667 | 2.9 | 0.9671 | 0.9660 | 0.9631 | alpha=0.01, 8600features |
| Text + (2,3)Gram | 0.9970 | 0.9500 | 4.7 | 0.9505 | 0.9467 | 0.9452 | alpha=0.01, 10100features | | Text + (2,3)Gram | 0.9970 | 0.9500 | 4.7 | 0.9505 | 0.9467 | 0.9452 | alpha=0.01, 10100features |
| Text + (2,4)Gram | 0.9975 | 0.9375 | 6.0 | 0.9366 | 0.9334 | 0.9311 | alpha=0.01, 16600features | | Text + (2,4)Gram | 0.9975 | 0.9375 | 6.0 | 0.9366 | 0.9334 | 0.9311 | alpha=0.01, 16600features |
| Stanza Dep. Relation tuples | 0.9995 | 0.9521 | 4.7 | 0.9513 | 0.9503 | 0.9484 | alpha=0.01, 8000features | | Stanza Dep. Relation tuples | 0.9995 | 0.9521 | 4.7 | 0.9513 | 0.9503 | 0.9484 | alpha=0.01, 8000features |
| Stanza Dep.Relation+POS Relations+Headwords tuples | 0.9986 | 0.9479 | 5.1 | 0.9481 | 0.9471 | 0.9440 | alpha=0.01, 7500features | | Stanza Dep.Relation+POS Relations+Headwords tuples | 0.9986 | 0.9479 | 5.1 | 0.9481 | 0.9471 | 0.9440 | alpha=0.01, 7500features |
| Stanza Dep. Relation tuples + (1,3) Grams | 1.0000 | 0.9750 | 2.5 | 0.9758 | 0.9747 | 0.9734 | alpha=0.01, 66000features | | Stanza Dep. Relation tuples + (1,3) Grams | 1.0000 | 0.9750 | 2.5 | 0.9758 | 0.9747 | 0.9734 | alpha=0.01, 66000features |
| BO synsets | 0.9782 | 0.9333 | 4.5 | 0.9325 | 0.9308 | 0.9272 | alpha=0.01, 1500features | | BO synsets | 0.9782 | 0.9333 | 4.5 | 0.9325 | 0.9308 | 0.9272 | alpha=0.01, 1500features |
| BO synsets + POS filtering | 0.9810 | 0.9271 | 5.4 | 0.9287 | 0.9256 | 0.9224 | alpha=0.01, 1500features | | BO synsets + POS filtering | 0.9810 | 0.9271 | 5.4 | 0.9287 | 0.9256 | 0.9224 | alpha=0.01, 1500features |
| BO synsets + WSD | 0.9961 | 0.9563 | 4.0 | 0.9594 | 0.9564 | 0.9542 | alpha=0.01,1750features | | BO synsets + WSD | 0.9961 | 0.9563 | 4.0 | 0.9594 | 0.9564 | 0.9542 | alpha=0.01,1750features |
| BO synsets + WSD + Stanza Dep. Relation tuples + (1,3) Grams | 0.9963 | 0.9708 | 2.5 | 0.9713 | 0.9706 | 0.9683 | alpha=0.01,5500features |
| | | | | | | | |
--- ---
> ***Applied features selection and model's hyperparameters tuning*** > ***Applied features selection and model's hyperparameters tuning***
......
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment