Commit 8917cc79 authored by Almouhannad Hafez's avatar Almouhannad Hafez

(0) Add dataset overview

parent 673dc40c
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import nltk\n",
"from nltk.stem import PorterStemmer\n", "from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n", "from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n", "from nltk.corpus import stopwords\n",
...@@ -29,58 +28,6 @@ ...@@ -29,58 +28,6 @@
"from constants import CONSTANTS" "from constants import CONSTANTS"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Now, we have to download `Punkt Tokenizer Model`, try running following cell, if it didn't work successfully then try to download model manually from following links: [Manual installation](https://www.nltk.org/data.html), and [Model link](https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Uncomment if you haven't already**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('punkt_tab')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### You must see an output similar to the following output:\n",
"> `[nltk_data] Downloading package punkt to` \n",
"> `[nltk_data] ...\\AppData\\Roaming\\nltk_data...` \n",
"> `[nltk_data] Package punkt is already up-to-date!` \n",
"> `True`"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
......
...@@ -42,33 +42,6 @@ ...@@ -42,33 +42,6 @@
"**Uncomment if you haven't already**" "**Uncomment if you haven't already**"
] ]
}, },
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger_eng')"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"from tabulate import tabulate\n",
"\n",
"import pandas as pd\n",
"\n",
"from constants import CONSTANTS"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords"
]
},
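{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook practices word N-grams: the texts are vectorized with TF-IDF over several N-gram ranges and classified with Multinomial Naive Bayes, once on the raw text and once on stemmed text."
]
},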
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***Some helper functions***"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"stemmer = PorterStemmer()\n",
"\n",
"\n",
"def stem_text(text):\n",
"    \"\"\"Return `text` with every token reduced to its Porter stem.\n",
"\n",
"    The text is tokenized with `word_tokenize`, each token is passed through\n",
"    the module-level `stemmer`, and the stems are re-joined with single spaces.\n",
"    \"\"\"\n",
"    return ' '.join(map(stemmer.stem, word_tokenize(text)))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def print_clf_report_as_table(report):\n",
"    \"\"\"Print a classification report dictionary as a `psql`-style table.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    report : dict\n",
"        Mapping in the layout of `classification_report(..., output_dict=True)`:\n",
"        one entry per class (a dict with 'precision', 'recall', 'f1-score' and\n",
"        'support') plus the summary entries 'accuracy', 'macro avg' and\n",
"        'weighted avg'.\n",
"\n",
"    Per-class rows are printed first, in the order they appear in `report`,\n",
"    followed by the summary rows that are present. Summary rows leave the\n",
"    'Support' column blank.\n",
"    \"\"\"\n",
"    summary_keys = ('accuracy', 'macro avg', 'weighted avg')\n",
"    metric_keys = ('precision', 'recall', 'f1-score')\n",
"\n",
"    rows = [\n",
"        [label, *(scores[metric] for metric in metric_keys), scores['support']]\n",
"        for label, scores in report.items()\n",
"        if label not in summary_keys\n",
"    ]\n",
"\n",
"    # classification_report can omit 'accuracy' (it reports 'micro avg' instead\n",
"    # for multilabel targets or a label subset), so only add the summary rows\n",
"    # that exist instead of raising KeyError.\n",
"    if 'accuracy' in report:\n",
"        rows.append(['accuracy', '', '', report['accuracy'], ''])\n",
"\n",
"    for avg_key in ('macro avg', 'weighted avg'):\n",
"        if avg_key in report:\n",
"            rows.append([avg_key, *(report[avg_key][metric] for metric in metric_keys), ''])\n",
"\n",
"    print(tabulate(rows, headers=['Class', 'Precision', 'Recall', 'F1-score', 'Support'], tablefmt='psql'))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def get_prepared_data():\n",
"    \"\"\"Load the train and test CSV files and split each into inputs and labels.\n",
"\n",
"    Reads `CONSTANTS.TRAIN_SET_PATH` and `CONSTANTS.TEST_SET_PATH`; the 'text'\n",
"    column is used as input and the 'label' column as target.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        `(X_train, X_test, y_train, y_test)`.\n",
"    \"\"\"\n",
"    train_df = pd.read_csv(CONSTANTS.TRAIN_SET_PATH)\n",
"    X_train, y_train = train_df['text'], train_df['label']\n",
"\n",
"    test_df = pd.read_csv(CONSTANTS.TEST_SET_PATH)\n",
"    X_test, y_test = test_df['text'], test_df['label']\n",
"\n",
"    return X_train, X_test, y_train, y_test"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_n_grams(n1, n2):\n",
"    \"\"\"Train and evaluate a TF-IDF + Multinomial Naive Bayes model on N-grams.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    n1, n2 : int\n",
"        Lower and upper bound of the `ngram_range` passed to `TfidfVectorizer`.\n",
"\n",
"    Prints the train and test accuracy, then the test-set classification\n",
"    report as a table.\n",
"    \"\"\"\n",
"    texts_train, texts_test, y_train, y_test = get_prepared_data()\n",
"\n",
"    # The TF-IDF vocabulary is fitted on the training texts only and reused for the test texts\n",
"    tfidf = TfidfVectorizer(ngram_range=(n1, n2))\n",
"    features_train = tfidf.fit_transform(texts_train)\n",
"    features_test = tfidf.transform(texts_test)\n",
"\n",
"    # Naive Bayes classifier\n",
"    model = MultinomialNB()\n",
"    model.fit(features_train, y_train)\n",
"\n",
"    # Predictions on both splits\n",
"    train_predictions = model.predict(features_train)\n",
"    test_predictions = model.predict(features_test)\n",
"\n",
"    train_accuracy = accuracy_score(y_train, train_predictions)\n",
"    test_accuracy = accuracy_score(y_test, test_predictions)\n",
"    print(f'Train Accuracy: {train_accuracy}')\n",
"    print(f'Test Accuracy: {test_accuracy}')\n",
"\n",
"    # Per-class metrics on the test set\n",
"    print_clf_report_as_table(\n",
"        classification_report(y_test, test_predictions, output_dict=True)\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_n_grams_stemming(n1, n2):\n",
"    \"\"\"Same evaluation as the plain N-gram run, but on stemmed text.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    n1, n2 : int\n",
"        Lower and upper bound of the `ngram_range` passed to `TfidfVectorizer`.\n",
"\n",
"    Both the train and the test texts go through `stem_text` before TF-IDF.\n",
"    Prints the train and test accuracy, then the test-set classification\n",
"    report as a table.\n",
"    \"\"\"\n",
"    texts_train, texts_test, y_train, y_test = get_prepared_data()\n",
"\n",
"    # Stem every document before vectorizing\n",
"    stemmed_train = texts_train.apply(stem_text)\n",
"    stemmed_test = texts_test.apply(stem_text)\n",
"\n",
"    # The TF-IDF vocabulary is fitted on the training texts only and reused for the test texts\n",
"    tfidf = TfidfVectorizer(ngram_range=(n1, n2))\n",
"    features_train = tfidf.fit_transform(stemmed_train)\n",
"    features_test = tfidf.transform(stemmed_test)\n",
"\n",
"    # Naive Bayes classifier\n",
"    model = MultinomialNB()\n",
"    model.fit(features_train, y_train)\n",
"\n",
"    # Predictions on both splits\n",
"    train_predictions = model.predict(features_train)\n",
"    test_predictions = model.predict(features_test)\n",
"\n",
"    train_accuracy = accuracy_score(y_train, train_predictions)\n",
"    test_accuracy = accuracy_score(y_test, test_predictions)\n",
"    print(f'Train Accuracy: {train_accuracy}')\n",
"    print(f'Test Accuracy: {test_accuracy}')\n",
"\n",
"    # Per-class metrics on the test set\n",
"    print_clf_report_as_table(\n",
"        classification_report(y_test, test_predictions, output_dict=True)\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1- N-Grams, without stemming***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-2***"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9985528219971056\n",
"Test Accuracy: 0.8982683982683982\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.8823529411764706 | 0.7894736842105263 | 0.833333 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7391304347826086 | 1.0 | 0.85 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.631578947368421 | 0.774194 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 0.8666666666666667 | 0.8125 | 0.83871 | 16.0 |\n",
"| fungal infection | 0.8181818181818182 | 1.0 | 0.9 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.88 | 1.0 | 0.93617 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8636363636363636 | 0.926829 | 22.0 |\n",
"| pneumonia | 1.0 | 0.8333333333333334 | 0.909091 | 24.0 |\n",
"| psoriasis | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.898268 | |\n",
"| macro avg | 0.9094259814081346 | 0.9040145423533582 | 0.897398 | |\n",
"| weighted avg | 0.9190699154565779 | 0.8982683982683982 | 0.899329 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(1, 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-3***"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9985528219971056\n",
"Test Accuracy: 0.9025974025974026\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8421052631578947 | 0.8421052631578947 | 0.842105 | 19.0 |\n",
"| common cold | 0.7619047619047619 | 0.8888888888888888 | 0.820513 | 18.0 |\n",
"| dengue | 0.56 | 0.875 | 0.682927 | 16.0 |\n",
"| diabetes | 1.0 | 0.631578947368421 | 0.774194 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 0.9230769230769231 | 0.75 | 0.827586 | 16.0 |\n",
"| fungal infection | 0.782608695652174 | 1.0 | 0.878049 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.6666666666666666 | 0.8 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.902597 | |\n",
"| macro avg | 0.9126118741615444 | 0.9068483076377812 | 0.901162 | |\n",
"| weighted avg | 0.9214411623430815 | 0.9025974025974026 | 0.90361 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(1, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-4***"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9985528219971056\n",
"Test Accuracy: 0.8982683982683982\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.7894736842105263 | 0.7894736842105263 | 0.789474 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.7894736842105263 | 0.7894736842105263 | 0.789474 | 19.0 |\n",
"| common cold | 0.7619047619047619 | 0.8888888888888888 | 0.820513 | 18.0 |\n",
"| dengue | 0.5416666666666666 | 0.8125 | 0.65 | 16.0 |\n",
"| diabetes | 1.0 | 0.7368421052631579 | 0.848485 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 0.9230769230769231 | 0.75 | 0.827586 | 16.0 |\n",
"| fungal infection | 0.8571428571428571 | 1.0 | 0.923077 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| pneumonia | 1.0 | 0.8333333333333334 | 0.909091 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.6666666666666666 | 0.8 | 24.0 |\n",
"| urinary tract infection | 0.8888888888888888 | 1.0 | 0.941176 | 16.0 |\n",
"| varicose veins | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| accuracy | | | 0.898268 | |\n",
"| macro avg | 0.9064396434462223 | 0.9028070729222044 | 0.897116 | |\n",
"| weighted avg | 0.9158868387251846 | 0.8982683982683982 | 0.899595 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(1, 4)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***2-3***"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9971056439942113\n",
"Test Accuracy: 0.8831168831168831\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.9473684210526315 | 0.9473684210526315 | 0.947368 | 19.0 |\n",
"| allergy | 0.7 | 0.7368421052631579 | 0.717949 | 19.0 |\n",
"| arthritis | 0.8235294117647058 | 1.0 | 0.903226 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 0.9545454545454546 | 1.0 | 0.976744 | 21.0 |\n",
"| chicken pox | 0.7727272727272727 | 0.8947368421052632 | 0.829268 | 19.0 |\n",
"| common cold | 0.7619047619047619 | 0.8888888888888888 | 0.820513 | 18.0 |\n",
"| dengue | 0.5416666666666666 | 0.8125 | 0.65 | 16.0 |\n",
"| diabetes | 1.0 | 0.631578947368421 | 0.774194 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 0.9166666666666666 | 0.6875 | 0.785714 | 16.0 |\n",
"| fungal infection | 0.8571428571428571 | 1.0 | 0.923077 | 18.0 |\n",
"| gastroesophageal reflux disease | 1.0 | 0.8636363636363636 | 0.926829 | 22.0 |\n",
"| hypertension | 0.8571428571428571 | 1.0 | 0.923077 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| malaria | 1.0 | 0.9565217391304348 | 0.977778 | 23.0 |\n",
"| migraine | 0.8947368421052632 | 0.8947368421052632 | 0.894737 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.7894736842105263 | 0.9375 | 0.857143 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.883117 | |\n",
"| macro avg | 0.8922552965896621 | 0.8876133016858132 | 0.880571 | |\n",
"| weighted avg | 0.9031294439867679 | 0.8831168831168831 | 0.883735 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(2, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2- N-Grams, with stemmed text***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-2***"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 1.0\n",
"Test Accuracy: 0.8939393939393939\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.8888888888888888 | 0.8421052631578947 | 0.864865 | 19.0 |\n",
"| arthritis | 0.8235294117647058 | 1.0 | 0.903226 | 14.0 |\n",
"| bronchial asthma | 0.7727272727272727 | 1.0 | 0.871795 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.5789473684210527 | 0.733333 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.8947368421052632 | 1.0 | 0.944444 | 17.0 |\n",
"| drug reaction | 0.8125 | 0.8125 | 0.8125 | 16.0 |\n",
"| fungal infection | 0.75 | 1.0 | 0.857143 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.88 | 1.0 | 0.93617 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| pneumonia | 1.0 | 0.8333333333333334 | 0.909091 | 24.0 |\n",
"| psoriasis | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.893939 | |\n",
"| macro avg | 0.9058354952604825 | 0.9002266635654793 | 0.892799 | |\n",
"| weighted avg | 0.9155999663087571 | 0.8939393939393939 | 0.894412 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(1, 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-3***"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 1.0\n",
"Test Accuracy: 0.8939393939393939\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.75 | 0.7894736842105263 | 0.769231 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7727272727272727 | 1.0 | 0.871795 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.5789473684210527 | 0.733333 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 1.0 | 0.75 | 0.857143 | 16.0 |\n",
"| fungal infection | 0.75 | 1.0 | 0.857143 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.893939 | |\n",
"| macro avg | 0.9078174569047892 | 0.899059564947723 | 0.893019 | |\n",
"| weighted avg | 0.9165861513197099 | 0.8939393939393939 | 0.895007 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(1, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-4***"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 1.0\n",
"Test Accuracy: 0.8961038961038961\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.75 | 0.7894736842105263 | 0.769231 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7727272727272727 | 1.0 | 0.871795 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.6842105263157895 | 0.8125 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 1.0 | 0.75 | 0.857143 | 16.0 |\n",
"| fungal infection | 0.8181818181818182 | 1.0 | 0.9 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.5833333333333334 | 0.736842 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| accuracy | | | 0.896104 | |\n",
"| macro avg | 0.9083435511808835 | 0.9017094187488924 | 0.895564 | |\n",
"| weighted avg | 0.9171983337500741 | 0.8961038961038961 | 0.897198 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(1, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***2-3***"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9971056439942113\n",
"Test Accuracy: 0.8766233766233766\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.9473684210526315 | 0.9473684210526315 | 0.947368 | 19.0 |\n",
"| allergy | 0.7142857142857143 | 0.7894736842105263 | 0.75 | 19.0 |\n",
"| arthritis | 0.8235294117647058 | 1.0 | 0.903226 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 0.9545454545454546 | 1.0 | 0.976744 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5 | 0.875 | 0.636364 | 16.0 |\n",
"| diabetes | 1.0 | 0.5789473684210527 | 0.733333 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 0.9230769230769231 | 0.75 | 0.827586 | 16.0 |\n",
"| fungal infection | 0.782608695652174 | 1.0 | 0.878049 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.9047619047619048 | 0.8636363636363636 | 0.883721 | 22.0 |\n",
"| hypertension | 0.9 | 1.0 | 0.947368 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 0.9565217391304348 | 0.977778 | 23.0 |\n",
"| migraine | 0.9444444444444444 | 0.8947368421052632 | 0.918919 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7272727272727273 | 0.842105 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.5416666666666666 | 0.702703 | 24.0 |\n",
"| urinary tract infection | 0.8421052631578947 | 1.0 | 0.914286 | 16.0 |\n",
"| varicose veins | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| accuracy | | | 0.876623 | |\n",
"| macro avg | 0.8914397714820886 | 0.8840070084907042 | 0.87559 | |\n",
"| weighted avg | 0.9017367812430991 | 0.8766233766233766 | 0.876885 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(2, 3)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
name: NLP name: NLP
channels: channels:
- conda-forge
- defaults - defaults
dependencies: dependencies:
- annotated-types=0.6.0=py39haa95532_0 - annotated-types=0.6.0=py39haa95532_0
...@@ -15,10 +16,11 @@ dependencies: ...@@ -15,10 +16,11 @@ dependencies:
- blas=1.0=mkl - blas=1.0=mkl
- bleach=4.1.0=pyhd3eb1b0_0 - bleach=4.1.0=pyhd3eb1b0_0
- bottleneck=1.3.7=py39h9128911_0 - bottleneck=1.3.7=py39h9128911_0
- brotli=1.0.9=ha925a31_2
- brotli-python=1.0.9=py39hd77b12b_8 - brotli-python=1.0.9=py39hd77b12b_8
- ca-certificates=2024.9.24=haa95532_0 - ca-certificates=2024.8.30=h56e8100_0
- catalogue=2.0.10=py39haa95532_0 - catalogue=2.0.10=py39haa95532_0
- certifi=2024.8.30=py39haa95532_0 - certifi=2024.8.30=pyhd8ed1ab_0
- cffi=1.17.1=py39h827c3e9_0 - cffi=1.17.1=py39h827c3e9_0
- charset-normalizer=3.3.2=pyhd3eb1b0_0 - charset-normalizer=3.3.2=pyhd3eb1b0_0
- click=8.1.7=py39haa95532_0 - click=8.1.7=py39haa95532_0
...@@ -26,6 +28,8 @@ dependencies: ...@@ -26,6 +28,8 @@ dependencies:
- colorama=0.4.6=py39haa95532_0 - colorama=0.4.6=py39haa95532_0
- comm=0.2.1=py39haa95532_0 - comm=0.2.1=py39haa95532_0
- confection=0.1.4=py39h9909e9c_0 - confection=0.1.4=py39h9909e9c_0
- contourpy=1.2.0=py39h59b6b97_0
- cycler=0.12.1=pyhd8ed1ab_0
- cymem=2.0.6=py39hd77b12b_0 - cymem=2.0.6=py39hd77b12b_0
- cython-blis=0.7.9=py39h080aedc_0 - cython-blis=0.7.9=py39h080aedc_0
- debugpy=1.6.7=py39hd77b12b_0 - debugpy=1.6.7=py39hd77b12b_0
...@@ -33,6 +37,8 @@ dependencies: ...@@ -33,6 +37,8 @@ dependencies:
- defusedxml=0.7.1=pyhd3eb1b0_0 - defusedxml=0.7.1=pyhd3eb1b0_0
- exceptiongroup=1.2.0=py39haa95532_0 - exceptiongroup=1.2.0=py39haa95532_0
- executing=0.8.3=pyhd3eb1b0_0 - executing=0.8.3=pyhd3eb1b0_0
- fonttools=4.25.0=pyhd3eb1b0_0
- freetype=2.10.4=h546665d_1
- h11=0.14.0=py39haa95532_0 - h11=0.14.0=py39haa95532_0
- httpcore=1.0.2=py39haa95532_0 - httpcore=1.0.2=py39haa95532_0
- httpx=0.27.0=py39haa95532_0 - httpx=0.27.0=py39haa95532_0
...@@ -40,6 +46,7 @@ dependencies: ...@@ -40,6 +46,7 @@ dependencies:
- idna=3.7=py39haa95532_0 - idna=3.7=py39haa95532_0
- importlib-metadata=7.0.1=py39haa95532_0 - importlib-metadata=7.0.1=py39haa95532_0
- importlib_metadata=7.0.1=hd3eb1b0_0 - importlib_metadata=7.0.1=hd3eb1b0_0
- importlib_resources=6.4.5=pyhd8ed1ab_0
- intel-openmp=2023.1.0=h59b6b97_46320 - intel-openmp=2023.1.0=h59b6b97_46320
- ipykernel=6.29.5=py39haa95532_0 - ipykernel=6.29.5=py39haa95532_0
- ipython=8.15.0=py39haa95532_0 - ipython=8.15.0=py39haa95532_0
...@@ -63,16 +70,23 @@ dependencies: ...@@ -63,16 +70,23 @@ dependencies:
- jupyterlab_pygments=0.1.2=py_0 - jupyterlab_pygments=0.1.2=py_0
- jupyterlab_server=2.27.3=py39haa95532_0 - jupyterlab_server=2.27.3=py39haa95532_0
- jupyterlab_widgets=3.0.10=py39haa95532_0 - jupyterlab_widgets=3.0.10=py39haa95532_0
- kiwisolver=1.4.4=py39hd77b12b_0
- krb5=1.20.1=h5b6d351_0 - krb5=1.20.1=h5b6d351_0
- langcodes=3.3.0=pyhd3eb1b0_0 - langcodes=3.3.0=pyhd3eb1b0_0
- lcms2=2.12=h83e58a3_0
- lerc=3.0=hd77b12b_0
- libclang=14.0.6=default_hb5a9fac_1 - libclang=14.0.6=default_hb5a9fac_1
- libclang13=14.0.6=default_h8e68704_1 - libclang13=14.0.6=default_h8e68704_1
- libdeflate=1.17=h2bbff1b_1
- libpng=1.6.39=h8cc25b3_0 - libpng=1.6.39=h8cc25b3_0
- libpq=12.17=h906ac69_0 - libpq=12.17=h906ac69_0
- libsodium=1.0.18=h62dcd97_0 - libsodium=1.0.18=h62dcd97_0
- libtiff=4.5.1=hd77b12b_0
- libwebp-base=1.3.2=h3d04722_1
- lz4-c=1.9.4=h2bbff1b_1 - lz4-c=1.9.4=h2bbff1b_1
- markdown-it-py=2.2.0=py39haa95532_1 - markdown-it-py=2.2.0=py39haa95532_1
- markupsafe=2.1.3=py39h2bbff1b_0 - markupsafe=2.1.3=py39h2bbff1b_0
- matplotlib-base=3.9.2=py39he19b0ae_0
- matplotlib-inline=0.1.6=py39haa95532_0 - matplotlib-inline=0.1.6=py39haa95532_0
- mdurl=0.1.0=py39haa95532_0 - mdurl=0.1.0=py39haa95532_0
- mistune=2.0.4=py39haa95532_0 - mistune=2.0.4=py39haa95532_0
...@@ -80,6 +94,7 @@ dependencies: ...@@ -80,6 +94,7 @@ dependencies:
- mkl-service=2.4.0=py39h2bbff1b_1 - mkl-service=2.4.0=py39h2bbff1b_1
- mkl_fft=1.3.10=py39h827c3e9_0 - mkl_fft=1.3.10=py39h827c3e9_0
- mkl_random=1.2.7=py39hc64d2fc_0 - mkl_random=1.2.7=py39hc64d2fc_0
- munkres=1.1.4=pyh9f0ad1d_0
- murmurhash=1.0.7=py39hd77b12b_0 - murmurhash=1.0.7=py39hd77b12b_0
- nbclient=0.8.0=py39haa95532_0 - nbclient=0.8.0=py39haa95532_0
- nbconvert=7.16.4=py39haa95532_0 - nbconvert=7.16.4=py39haa95532_0
...@@ -91,6 +106,7 @@ dependencies: ...@@ -91,6 +106,7 @@ dependencies:
- numexpr=2.10.1=py39h4cd664f_0 - numexpr=2.10.1=py39h4cd664f_0
- numpy=1.26.4=py39h055cbcc_0 - numpy=1.26.4=py39h055cbcc_0
- numpy-base=1.26.4=py39h65a83cf_0 - numpy-base=1.26.4=py39h65a83cf_0
- openjpeg=2.5.2=hae555c5_0
- openssl=3.0.15=h827c3e9_0 - openssl=3.0.15=h827c3e9_0
- overrides=7.4.0=py39haa95532_0 - overrides=7.4.0=py39haa95532_0
- packaging=24.1=py39haa95532_0 - packaging=24.1=py39haa95532_0
...@@ -98,6 +114,8 @@ dependencies: ...@@ -98,6 +114,8 @@ dependencies:
- pandocfilters=1.5.0=pyhd3eb1b0_0 - pandocfilters=1.5.0=pyhd3eb1b0_0
- parso=0.8.3=pyhd3eb1b0_0 - parso=0.8.3=pyhd3eb1b0_0
- pickleshare=0.7.5=pyhd3eb1b0_1003 - pickleshare=0.7.5=pyhd3eb1b0_1003
- pillow=10.4.0=py39h827c3e9_0
- pip=24.3.1=pyh8b19718_0
- platformdirs=3.10.0=py39haa95532_0 - platformdirs=3.10.0=py39haa95532_0
- ply=3.11=py39haa95532_0 - ply=3.11=py39haa95532_0
- preshed=3.0.6=py39h6c2663c_0 - preshed=3.0.6=py39h6c2663c_0
...@@ -110,6 +128,7 @@ dependencies: ...@@ -110,6 +128,7 @@ dependencies:
- pydantic=2.8.2=py39haa95532_0 - pydantic=2.8.2=py39haa95532_0
- pydantic-core=2.20.1=py39hefb1915_0 - pydantic-core=2.20.1=py39hefb1915_0
- pygments=2.15.1=py39haa95532_1 - pygments=2.15.1=py39haa95532_1
- pyparsing=3.2.0=pyhd8ed1ab_1
- pyqt=5.15.10=py39hd77b12b_0 - pyqt=5.15.10=py39hd77b12b_0
- pyqt5-sip=12.13.0=py39h2bbff1b_0 - pyqt5-sip=12.13.0=py39h2bbff1b_0
- pysocks=1.7.1=py39haa95532_0 - pysocks=1.7.1=py39haa95532_0
...@@ -172,6 +191,7 @@ dependencies: ...@@ -172,6 +191,7 @@ dependencies:
- widgetsnbextension=4.0.10=py39haa95532_0 - widgetsnbextension=4.0.10=py39haa95532_0
- win_inet_pton=1.1.0=py39haa95532_0 - win_inet_pton=1.1.0=py39haa95532_0
- winpty=0.4.3=4 - winpty=0.4.3=4
- wordcloud=1.9.3=py39h2bbff1b_0
- xz=5.4.6=h8cc25b3_1 - xz=5.4.6=h8cc25b3_1
- yaml=0.2.5=he774522_0 - yaml=0.2.5=he774522_0
- zeromq=4.3.5=hd77b12b_0 - zeromq=4.3.5=hd77b12b_0
...@@ -184,7 +204,6 @@ dependencies: ...@@ -184,7 +204,6 @@ dependencies:
- emoji==2.14.0 - emoji==2.14.0
- gensim==4.3.3 - gensim==4.3.3
- huggingface-hub==0.26.2 - huggingface-hub==0.26.2
- pip==24.3.1
- pyahocorasick==2.1.0 - pyahocorasick==2.1.0
- safetensors==0.4.5 - safetensors==0.4.5
- scikit-learn==1.5.2 - scikit-learn==1.5.2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment