Commit 32c7b8ac authored by Almouhannad Hafez's avatar Almouhannad Hafez

(3) Add features extraction to N-Grams

parent 95888513
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"from tabulate import tabulate\n",
"\n",
"import pandas as pd\n",
"\n",
"from constants import CONSTANTS"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
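"The NLP technique practiced in this notebook is N-gram feature extraction: TF-IDF features over word N-grams, classified with Multinomial Naive Bayes."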
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***Some helper functions***"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# One stemmer instance, created once and reused by every call to stem_text\n",
"stemmer = PorterStemmer()\n",
"\n",
"\n",
"def stem_text(text):\n",
"    \"\"\"Return `text` with every token replaced by its Porter stem.\n",
"\n",
"    The text is split with NLTK's `word_tokenize`, each token is passed\n",
"    through the module-level `stemmer`, and the resulting stems are joined\n",
"    back into one string separated by single spaces.\n",
"    \"\"\"\n",
"    return ' '.join(map(stemmer.stem, word_tokenize(text)))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def print_clf_report_as_table(report):\n",
"    \"\"\"Print a scikit-learn classification report dict as a psql-style table.\n",
"\n",
"    Args:\n",
"        report: Mapping shaped like the result of\n",
"            `classification_report(..., output_dict=True)`: one dict with\n",
"            'precision', 'recall', 'f1-score' and 'support' per class label,\n",
"            plus summary entries ('accuracy' as a bare number, '... avg' as\n",
"            dicts with the same metric keys).\n",
"\n",
"    Class rows come first, in the mapping's own order, followed by whichever\n",
"    summary rows are present. Summary rows leave the 'Support' column blank.\n",
"    \"\"\"\n",
"    # scikit-learn emits 'micro avg' INSTEAD of 'accuracy' for multi-label input\n",
"    # or when `labels` is a subset of the classes, and adds 'samples avg' for\n",
"    # multi-label input. Look each summary entry up only if it exists, so such\n",
"    # reports do not raise KeyError or get their averages listed as classes.\n",
"    average_keys = ('micro avg', 'macro avg', 'weighted avg', 'samples avg')\n",
"    summary_keys = ('accuracy',) + average_keys\n",
"\n",
"    rows = [\n",
"        [label, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]\n",
"        for label, scores in report.items()\n",
"        if label not in summary_keys\n",
"    ]\n",
"\n",
"    if 'accuracy' in report:\n",
"        # Accuracy is a single number; it is shown under the F1-score column\n",
"        rows.append(['accuracy', '', '', report['accuracy'], ''])\n",
"\n",
"    for key in average_keys:\n",
"        if key in report:\n",
"            scores = report[key]\n",
"            rows.append([key, scores['precision'], scores['recall'], scores['f1-score'], ''])\n",
"\n",
"    print(tabulate(rows, headers=['Class', 'Precision', 'Recall', 'F1-score', 'Support'], tablefmt='psql'))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def get_prepared_data():\n",
"    \"\"\"Load the train and test CSV files and split each into texts and labels.\n",
"\n",
"    Reads `CONSTANTS.TRAIN_SET_PATH` first, then `CONSTANTS.TEST_SET_PATH`.\n",
"\n",
"    Returns:\n",
"        Tuple `(X_train, X_test, y_train, y_test)`, where the X values are the\n",
"        'text' columns and the y values are the 'label' columns.\n",
"    \"\"\"\n",
"    def load_split(csv_path):\n",
"        frame = pd.read_csv(csv_path)\n",
"        return frame['text'], frame['label']\n",
"\n",
"    train_texts, train_labels = load_split(CONSTANTS.TRAIN_SET_PATH)\n",
"    test_texts, test_labels = load_split(CONSTANTS.TEST_SET_PATH)\n",
"    return train_texts, test_texts, train_labels, test_labels"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_n_grams(n1, n2):\n",
"    \"\"\"Train and score a TF-IDF + Multinomial Naive Bayes model on N-grams.\n",
"\n",
"    Args:\n",
"        n1: Smallest N-gram size, first element of the vectorizer's `ngram_range`.\n",
"        n2: Largest N-gram size, second element of the vectorizer's `ngram_range`.\n",
"\n",
"    Prints the train accuracy, the test accuracy and the per-class report\n",
"    for the test set. Nothing is returned.\n",
"    \"\"\"\n",
"    train_texts, test_texts, train_labels, test_labels = get_prepared_data()\n",
"\n",
"    # The vectorizer is fitted on the training texts only; the test texts are\n",
"    # merely transformed with the fitted vectorizer.\n",
"    tfidf = TfidfVectorizer(ngram_range=(n1, n2))\n",
"    train_features = tfidf.fit_transform(train_texts)\n",
"    test_features = tfidf.transform(test_texts)\n",
"\n",
"    model = MultinomialNB()\n",
"    model.fit(train_features, train_labels)\n",
"\n",
"    # Score on the data the model was fitted on, then on the held-out test set\n",
"    train_predictions = model.predict(train_features)\n",
"    train_accuracy = accuracy_score(train_labels, train_predictions)\n",
"    test_predictions = model.predict(test_features)\n",
"    test_accuracy = accuracy_score(test_labels, test_predictions)\n",
"\n",
"    print(f'Train Accuracy: {train_accuracy}')\n",
"    print(f'Test Accuracy: {test_accuracy}')\n",
"\n",
"    # Per-class precision / recall / F1 on the test set\n",
"    print_clf_report_as_table(\n",
"        classification_report(test_labels, test_predictions, output_dict=True)\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_n_grams_stemming(n1, n2):\n",
"    \"\"\"Train and score a TF-IDF + Multinomial Naive Bayes model on stemmed text.\n",
"\n",
"    Every train and test document is passed through `stem_text` before the\n",
"    N-gram features are extracted.\n",
"\n",
"    Args:\n",
"        n1: Smallest N-gram size, first element of the vectorizer's `ngram_range`.\n",
"        n2: Largest N-gram size, second element of the vectorizer's `ngram_range`.\n",
"\n",
"    Prints the train accuracy, the test accuracy and the per-class report\n",
"    for the test set. Nothing is returned.\n",
"    \"\"\"\n",
"    raw_train, raw_test, train_labels, test_labels = get_prepared_data()\n",
"\n",
"    # Stem each document before any features are extracted\n",
"    stemmed_train = raw_train.apply(stem_text)\n",
"    stemmed_test = raw_test.apply(stem_text)\n",
"\n",
"    # The vectorizer is fitted on the training texts only; the test texts are\n",
"    # merely transformed with the fitted vectorizer.\n",
"    tfidf = TfidfVectorizer(ngram_range=(n1, n2))\n",
"    train_features = tfidf.fit_transform(stemmed_train)\n",
"    test_features = tfidf.transform(stemmed_test)\n",
"\n",
"    model = MultinomialNB()\n",
"    model.fit(train_features, train_labels)\n",
"\n",
"    # Score on the data the model was fitted on, then on the held-out test set\n",
"    train_predictions = model.predict(train_features)\n",
"    train_accuracy = accuracy_score(train_labels, train_predictions)\n",
"    test_predictions = model.predict(test_features)\n",
"    test_accuracy = accuracy_score(test_labels, test_predictions)\n",
"\n",
"    print(f'Train Accuracy: {train_accuracy}')\n",
"    print(f'Test Accuracy: {test_accuracy}')\n",
"\n",
"    # Per-class precision / recall / F1 on the test set\n",
"    print_clf_report_as_table(\n",
"        classification_report(test_labels, test_predictions, output_dict=True)\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1- N-Grams, no stemming***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-2***"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9985528219971056\n",
"Test Accuracy: 0.8982683982683982\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.8823529411764706 | 0.7894736842105263 | 0.833333 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7391304347826086 | 1.0 | 0.85 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.631578947368421 | 0.774194 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 0.8666666666666667 | 0.8125 | 0.83871 | 16.0 |\n",
"| fungal infection | 0.8181818181818182 | 1.0 | 0.9 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.88 | 1.0 | 0.93617 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8636363636363636 | 0.926829 | 22.0 |\n",
"| pneumonia | 1.0 | 0.8333333333333334 | 0.909091 | 24.0 |\n",
"| psoriasis | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.898268 | |\n",
"| macro avg | 0.9094259814081346 | 0.9040145423533582 | 0.897398 | |\n",
"| weighted avg | 0.9190699154565779 | 0.8982683982683982 | 0.899329 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(1, 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-3***"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9985528219971056\n",
"Test Accuracy: 0.9025974025974026\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8421052631578947 | 0.8421052631578947 | 0.842105 | 19.0 |\n",
"| common cold | 0.7619047619047619 | 0.8888888888888888 | 0.820513 | 18.0 |\n",
"| dengue | 0.56 | 0.875 | 0.682927 | 16.0 |\n",
"| diabetes | 1.0 | 0.631578947368421 | 0.774194 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 0.9230769230769231 | 0.75 | 0.827586 | 16.0 |\n",
"| fungal infection | 0.782608695652174 | 1.0 | 0.878049 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.6666666666666666 | 0.8 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.902597 | |\n",
"| macro avg | 0.9126118741615444 | 0.9068483076377812 | 0.901162 | |\n",
"| weighted avg | 0.9214411623430815 | 0.9025974025974026 | 0.90361 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(1, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-4***"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9985528219971056\n",
"Test Accuracy: 0.8982683982683982\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.7894736842105263 | 0.7894736842105263 | 0.789474 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.7894736842105263 | 0.7894736842105263 | 0.789474 | 19.0 |\n",
"| common cold | 0.7619047619047619 | 0.8888888888888888 | 0.820513 | 18.0 |\n",
"| dengue | 0.5416666666666666 | 0.8125 | 0.65 | 16.0 |\n",
"| diabetes | 1.0 | 0.7368421052631579 | 0.848485 | 19.0 |\n",
"| dimorphic hemorrhoids | 1.0 | 1.0 | 1 | 17.0 |\n",
"| drug reaction | 0.9230769230769231 | 0.75 | 0.827586 | 16.0 |\n",
"| fungal infection | 0.8571428571428571 | 1.0 | 0.923077 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| pneumonia | 1.0 | 0.8333333333333334 | 0.909091 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.6666666666666666 | 0.8 | 24.0 |\n",
"| urinary tract infection | 0.8888888888888888 | 1.0 | 0.941176 | 16.0 |\n",
"| varicose veins | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| accuracy | | | 0.898268 | |\n",
"| macro avg | 0.9064396434462223 | 0.9028070729222044 | 0.897116 | |\n",
"| weighted avg | 0.9158868387251846 | 0.8982683982683982 | 0.899595 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(1, 4)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***2-3***"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9971056439942113\n",
"Test Accuracy: 0.8831168831168831\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.9473684210526315 | 0.9473684210526315 | 0.947368 | 19.0 |\n",
"| allergy | 0.7 | 0.7368421052631579 | 0.717949 | 19.0 |\n",
"| arthritis | 0.8235294117647058 | 1.0 | 0.903226 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 0.9545454545454546 | 1.0 | 0.976744 | 21.0 |\n",
"| chicken pox | 0.7727272727272727 | 0.8947368421052632 | 0.829268 | 19.0 |\n",
"| common cold | 0.7619047619047619 | 0.8888888888888888 | 0.820513 | 18.0 |\n",
"| dengue | 0.5416666666666666 | 0.8125 | 0.65 | 16.0 |\n",
"| diabetes | 1.0 | 0.631578947368421 | 0.774194 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 0.9166666666666666 | 0.6875 | 0.785714 | 16.0 |\n",
"| fungal infection | 0.8571428571428571 | 1.0 | 0.923077 | 18.0 |\n",
"| gastroesophageal reflux disease | 1.0 | 0.8636363636363636 | 0.926829 | 22.0 |\n",
"| hypertension | 0.8571428571428571 | 1.0 | 0.923077 | 18.0 |\n",
"| impetigo | 1.0 | 0.96 | 0.979592 | 25.0 |\n",
"| jaundice | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| malaria | 1.0 | 0.9565217391304348 | 0.977778 | 23.0 |\n",
"| migraine | 0.8947368421052632 | 0.8947368421052632 | 0.894737 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.7894736842105263 | 0.9375 | 0.857143 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.883117 | |\n",
"| macro avg | 0.8922552965896621 | 0.8876133016858132 | 0.880571 | |\n",
"| weighted avg | 0.9031294439867679 | 0.8831168831168831 | 0.883735 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams(2, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2- N-Grams, with stemmed text***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-2***"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 1.0\n",
"Test Accuracy: 0.8939393939393939\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.8888888888888888 | 0.8421052631578947 | 0.864865 | 19.0 |\n",
"| arthritis | 0.8235294117647058 | 1.0 | 0.903226 | 14.0 |\n",
"| bronchial asthma | 0.7727272727272727 | 1.0 | 0.871795 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.5789473684210527 | 0.733333 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.8947368421052632 | 1.0 | 0.944444 | 17.0 |\n",
"| drug reaction | 0.8125 | 0.8125 | 0.8125 | 16.0 |\n",
"| fungal infection | 0.75 | 1.0 | 0.857143 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.88 | 1.0 | 0.93617 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| pneumonia | 1.0 | 0.8333333333333334 | 0.909091 | 24.0 |\n",
"| psoriasis | 1.0 | 0.7727272727272727 | 0.871795 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.893939 | |\n",
"| macro avg | 0.9058354952604825 | 0.9002266635654793 | 0.892799 | |\n",
"| weighted avg | 0.9155999663087571 | 0.8939393939393939 | 0.894412 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(1, 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-3***"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 1.0\n",
"Test Accuracy: 0.8939393939393939\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.75 | 0.7894736842105263 | 0.769231 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7727272727272727 | 1.0 | 0.871795 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.5789473684210527 | 0.733333 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 1.0 | 0.75 | 0.857143 | 16.0 |\n",
"| fungal infection | 0.75 | 1.0 | 0.857143 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.625 | 0.769231 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 1.0 | 1.0 | 1 | 17.0 |\n",
"| accuracy | | | 0.893939 | |\n",
"| macro avg | 0.9078174569047892 | 0.899059564947723 | 0.893019 | |\n",
"| weighted avg | 0.9165861513197099 | 0.8939393939393939 | 0.895007 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(1, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***1-4***"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 1.0\n",
"Test Accuracy: 0.8961038961038961\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.95 | 1.0 | 0.974359 | 19.0 |\n",
"| allergy | 0.75 | 0.7894736842105263 | 0.769231 | 19.0 |\n",
"| arthritis | 0.7777777777777778 | 1.0 | 0.875 | 14.0 |\n",
"| bronchial asthma | 0.7727272727272727 | 1.0 | 0.871795 | 17.0 |\n",
"| cervical spondylosis | 1.0 | 1.0 | 1 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5185185185185185 | 0.875 | 0.651163 | 16.0 |\n",
"| diabetes | 1.0 | 0.6842105263157895 | 0.8125 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 1.0 | 0.75 | 0.857143 | 16.0 |\n",
"| fungal infection | 0.8181818181818182 | 1.0 | 0.9 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.875 | 0.9545454545454546 | 0.913043 | 22.0 |\n",
"| hypertension | 0.9473684210526315 | 1.0 | 0.972973 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 1.0 | 1 | 23.0 |\n",
"| migraine | 1.0 | 0.8947368421052632 | 0.944444 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.5833333333333334 | 0.736842 | 24.0 |\n",
"| urinary tract infection | 0.9411764705882353 | 1.0 | 0.969697 | 16.0 |\n",
"| varicose veins | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| accuracy | | | 0.896104 | |\n",
"| macro avg | 0.9083435511808835 | 0.9017094187488924 | 0.895564 | |\n",
"| weighted avg | 0.9171983337500741 | 0.8961038961038961 | 0.897198 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(1, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ***2-3***"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9971056439942113\n",
"Test Accuracy: 0.8766233766233766\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n",
"| Class | Precision | Recall | F1-score | Support |\n",
"|---------------------------------+--------------------+--------------------+------------+-----------|\n",
"| acne | 0.9473684210526315 | 0.9473684210526315 | 0.947368 | 19.0 |\n",
"| allergy | 0.7142857142857143 | 0.7894736842105263 | 0.75 | 19.0 |\n",
"| arthritis | 0.8235294117647058 | 1.0 | 0.903226 | 14.0 |\n",
"| bronchial asthma | 0.7083333333333334 | 1.0 | 0.829268 | 17.0 |\n",
"| cervical spondylosis | 0.9545454545454546 | 1.0 | 0.976744 | 21.0 |\n",
"| chicken pox | 0.8333333333333334 | 0.7894736842105263 | 0.810811 | 19.0 |\n",
"| common cold | 0.7272727272727273 | 0.8888888888888888 | 0.8 | 18.0 |\n",
"| dengue | 0.5 | 0.875 | 0.636364 | 16.0 |\n",
"| diabetes | 1.0 | 0.5789473684210527 | 0.733333 | 19.0 |\n",
"| dimorphic hemorrhoids | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| drug reaction | 0.9230769230769231 | 0.75 | 0.827586 | 16.0 |\n",
"| fungal infection | 0.782608695652174 | 1.0 | 0.878049 | 18.0 |\n",
"| gastroesophageal reflux disease | 0.9047619047619048 | 0.8636363636363636 | 0.883721 | 22.0 |\n",
"| hypertension | 0.9 | 1.0 | 0.947368 | 18.0 |\n",
"| impetigo | 1.0 | 0.92 | 0.958333 | 25.0 |\n",
"| jaundice | 1.0 | 1.0 | 1 | 17.0 |\n",
"| malaria | 1.0 | 0.9565217391304348 | 0.977778 | 23.0 |\n",
"| migraine | 0.9444444444444444 | 0.8947368421052632 | 0.918919 | 19.0 |\n",
"| peptic ulcer disease | 1.0 | 0.7272727272727273 | 0.842105 | 22.0 |\n",
"| pneumonia | 1.0 | 0.875 | 0.933333 | 24.0 |\n",
"| psoriasis | 1.0 | 0.8181818181818182 | 0.9 | 22.0 |\n",
"| typhoid | 1.0 | 0.5416666666666666 | 0.702703 | 24.0 |\n",
"| urinary tract infection | 0.8421052631578947 | 1.0 | 0.914286 | 16.0 |\n",
"| varicose veins | 0.9444444444444444 | 1.0 | 0.971429 | 17.0 |\n",
"| accuracy | | | 0.876623 | |\n",
"| macro avg | 0.8914397714820886 | 0.8840070084907042 | 0.87559 | |\n",
"| weighted avg | 0.9017367812430991 | 0.8766233766233766 | 0.876885 | |\n",
"+---------------------------------+--------------------+--------------------+------------+-----------+\n"
]
}
],
"source": [
"evaluate_n_grams_stemming(2, 3)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment