5.1.Dep_parsing_classifier.ipynb 260 KB
Newer Older
1
{

    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# ***Setup***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 1,
            "metadata": {},
            "outputs": [],
            "source": [
                "import matplotlib.pyplot as plt\n",
                "\n",
                "import numpy as np\n",
                "\n",
                "import pandas as pd\n",
                "\n",
                "import pickle\n",
                "\n",
                "from sklearn.feature_selection import SelectKBest, chi2\n",
                "from sklearn.feature_extraction.text import TfidfVectorizer\n",
                "from sklearn.naive_bayes import MultinomialNB\n",
                "from sklearn.model_selection import train_test_split\n",
                "from sklearn.metrics import accuracy_score, classification_report\n",
                "\n",
                "from tabulate import tabulate"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "metadata": {},
            "outputs": [],
            "source": [
                "import sys\n",
                "import os\n",
                "parent_dir = os.path.abspath('..')\n",
                "sys.path.append(parent_dir)\n",
                "from constants import CONSTANTS\n",
                "%load_ext autoreload\n",
                "%autoreload 2"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## ***Load dataset***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "metadata": {},
            "outputs": [],
            "source": [
                "train_df = pd.read_csv(CONSTANTS.AUGMENTED_TRAIN_SET_PATH)\n",
                "test_df = pd.read_csv(CONSTANTS.AUGMENTED_TEST_SET_PATH)"
            ]
        },
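        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## ***Load dependency parsing results***\n",
                "The parsed documents are stored as a pickle file. Unpickling can execute arbitrary code, so only load this file when it comes from a trusted source."
            ]
        },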
        {
            "cell_type": "code",
            "execution_count": 4,
            "metadata": {},
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "C:\\Users\\TOSHIBA\\AppData\\Roaming\\Python\\Python39\\site-packages\\networkx\\utils\\backends.py:135: RuntimeWarning: networkx backend defined more than once: nx-loopback\n",
                        "  backends.update(_get_backends(\"networkx.backends\"))\n"
                    ]
                }
            ],
            "source": [
                "with open(CONSTANTS.DEP_PARSED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
                "    loaded_data = pickle.load(f)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## ***Helper functions***"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Get processed text by row id***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Every dataset row carries an id; loaded_data pairs that id with its Stanza-processed document.\n",
                "def get_doc_by_id(target_id):\n",
                "    \"\"\"Look up the processed document stored for a dataset row.\n",
                "\n",
                "    Walks `loaded_data` in order and returns the `processed_text` of the first\n",
                "    entry whose `id` equals `target_id`, or None when no entry matches.\n",
                "    \"\"\"\n",
                "    matching_docs = (\n",
                "        entry[\"processed_text\"]\n",
                "        for entry in loaded_data\n",
                "        if entry[\"id\"] == target_id\n",
                "    )\n",
                "    return next(matching_docs, None)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Features extraction***\n",
                "**Dependency Relation Tuples:**  \n",
                "`(head_word, dependent_word, dependency_relation)`"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "metadata": {},
            "outputs": [],
            "source": [
                "def extract_dependency_features(row_id):\n",
                "    \"\"\"Build (head_lemma, dependent_lemma, deprel) tuples for one dataset row.\n",
                "\n",
                "    Args:\n",
                "        row_id: Id of the row whose parsed document is fetched with `get_doc_by_id`.\n",
                "\n",
                "    Returns:\n",
                "        A list with one tuple per non-root word, in sentence order.\n",
                "\n",
                "    Raises:\n",
                "        ValueError: if no parsed document exists for `row_id`.\n",
                "    \"\"\"\n",
                "    doc = get_doc_by_id(row_id)\n",
                "    if doc is None:\n",
                "        # Report the offending id instead of failing later with an AttributeError on None.\n",
                "        raise ValueError(f\"No parsed document found for row id {row_id!r}\")\n",
                "\n",
                "    feature_tuples = []\n",
                "    for sentence in doc.sentences:\n",
                "        for word in sentence.words:\n",
                "            if word.head > 0:  # head == 0 marks the root, which has no head word\n",
                "                # `word.head` is 1-based, so shift it to index into `sentence.words`\n",
                "                head = sentence.words[word.head - 1]\n",
                "                feature_tuples.append((head.lemma, word.lemma, word.deprel))\n",
                "    return feature_tuples\n"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "- **Dependency Relation Tuples:**  \n",
                "`(head_word, dependent_word, dependency_relation)`  \n",
                "- **POS Tag Tuples:**  \n",
                "`(head_pos, dependent_pos, dependency_relation)`\n",
                "- **Root Word Relation Tuple**  \n",
                "`(root_word, \"ROOT\")`"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 7,
            "metadata": {},
            "outputs": [],
            "source": [
                "def extract_full_features(row_id):\n",
                "    \"\"\"Build lexical, POS and root features for one dataset row.\n",
                "\n",
                "    For every non-root word two tuples are emitted:\n",
                "    (head_lemma, dependent_lemma, deprel) and (head_upos, dependent_upos, deprel).\n",
                "    Every word whose relation is \"root\" additionally yields (lemma, \"ROOT\").\n",
                "\n",
                "    Args:\n",
                "        row_id: Id of the row whose parsed document is fetched with `get_doc_by_id`.\n",
                "\n",
                "    Returns:\n",
                "        A list of feature tuples in sentence/word order.\n",
                "\n",
                "    Raises:\n",
                "        ValueError: if no parsed document exists for `row_id`.\n",
                "    \"\"\"\n",
                "    doc = get_doc_by_id(row_id)\n",
                "    if doc is None:\n",
                "        # Report the offending id instead of failing later with an AttributeError on None.\n",
                "        raise ValueError(f\"No parsed document found for row id {row_id!r}\")\n",
                "\n",
                "    feature_tuples = []\n",
                "    for sentence in doc.sentences:\n",
                "        for word in sentence.words:\n",
                "            if word.head > 0:  # head == 0 marks the root, which has no head word\n",
                "                # `word.head` is 1-based, so shift it to index into `sentence.words`\n",
                "                head = sentence.words[word.head - 1]\n",
                "                # Lexical dependency tuple\n",
                "                feature_tuples.append((head.lemma, word.lemma, word.deprel))\n",
                "                # Same relation expressed with universal POS tags\n",
                "                feature_tuples.append((head.upos, word.upos, word.deprel))\n",
                "\n",
                "            # Root word tuple\n",
                "            if word.deprel == \"root\":\n",
                "                feature_tuples.append((word.lemma, \"ROOT\"))\n",
                "\n",
                "    return feature_tuples"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Features extraction***\n",
                "**Dependency relation tuples and lemma n-grams:**  \n",
                "- `(head_word, dependent_word, dependency_relation)`\n",
                "- Lemma n-grams for every `n` from `n1` to `n2` (inclusive)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "metadata": {},
            "outputs": [],
            "source": [
                "def extract_dependency_features_with_n_grams(row_id, n1=1, n2=2):\n",
                "    \"\"\"Collect dependency tuples plus lemma n-grams for one dataset row.\n",
                "\n",
                "    Args:\n",
                "        row_id: Id of the row whose parsed document is fetched with `get_doc_by_id`.\n",
                "        n1: Smallest n-gram length (values below 1 are skipped).\n",
                "        n2: Largest n-gram length, inclusive.\n",
                "\n",
                "    Returns:\n",
                "        A set containing (head_lemma, dependent_lemma, deprel) tuples for every\n",
                "        non-root word and a tuple of lemmas for every n-gram with n1 <= n <= n2.\n",
                "        Being a set, duplicates are dropped and the order is unspecified.\n",
                "\n",
                "    Raises:\n",
                "        ValueError: if no parsed document exists for `row_id`.\n",
                "    \"\"\"\n",
                "    doc = get_doc_by_id(row_id)\n",
                "    if doc is None:\n",
                "        # Report the offending id instead of failing later with an AttributeError on None.\n",
                "        raise ValueError(f\"No parsed document found for row id {row_id!r}\")\n",
                "\n",
                "    # Lengths below 1 would only contribute the empty tuple, which is not a feature.\n",
                "    first_n = max(n1, 1)\n",
                "    feature_tuples = set()  # Use a set to avoid duplicates\n",
                "\n",
                "    for sentence in doc.sentences:\n",
                "        # Dependency relation tuples\n",
                "        for word in sentence.words:\n",
                "            if word.head > 0:  # head == 0 marks the root, which has no head word\n",
                "                # `word.head` is 1-based, so shift it to index into `sentence.words`\n",
                "                head = sentence.words[word.head - 1]\n",
                "                feature_tuples.add((head.lemma, word.lemma, word.deprel))\n",
                "\n",
                "        # Lemma n-grams of this sentence, from first_n to n2 inclusive\n",
                "        lemmas = [word.lemma for word in sentence.words]\n",
                "        for n in range(first_n, n2 + 1):\n",
                "            for start in range(len(lemmas) - n + 1):\n",
                "                feature_tuples.add(tuple(lemmas[start:start + n]))\n",
                "\n",
                "    return feature_tuples"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "#### ***Usage example***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 9,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "The skin on my palms and soles is thickened and has deep cracks. These cracks are painful and bleed easily.\n"
                    ]
                }
            ],
            "source": [
                "print(train_df[train_df[\"Id\"] == 5][\"text\"].to_list()[0])"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "[('skin', 'the', 'det'),\n",
                            " ('thicken', 'skin', 'nsubj'),\n",
                            " ('palm', 'on', 'case'),\n",
                            " ('palm', 'my', 'nmod:poss'),\n",
                            " ('skin', 'palm', 'nmod'),\n",
                            " ('sole', 'and', 'cc'),\n",
                            " ('palm', 'sole', 'conj'),\n",
                            " ('thicken', 'be', 'cop'),\n",
                            " ('have', 'and', 'cc'),\n",
                            " ('thicken', 'have', 'conj'),\n",
                            " ('crack', 'deep', 'amod'),\n",
                            " ('have', 'crack', 'obj'),\n",
                            " ('thicken', '.', 'punct'),\n",
                            " ('crack', 'this', 'det'),\n",
                            " ('painful', 'crack', 'nsubj'),\n",
                            " ('painful', 'be', 'cop'),\n",
                            " ('bleed', 'and', 'cc'),\n",
                            " ('painful', 'bleed', 'conj'),\n",
                            " ('bleed', 'easily', 'advmod'),\n",
                            " ('painful', '.', 'punct')]"
                        ]
                    },
                    "execution_count": 10,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "extract_dependency_features(5)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Flatten tuples into strings***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Flatten tuples into strings\n",
                "def flatten_tuples(features):\n",
                "    return ['_'.join(map(str, t)) for t in features]"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Features selection***\n",
                "- Using `SelectKBest`"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 12,
            "metadata": {},
            "outputs": [],
            "source": [
                "def select_features(X_train, X_test, y_train, scorer, k_value):\n",
                "    \"\"\"Keep the `k_value` best-scoring features according to `scorer`.\n",
                "\n",
                "    The selector is fitted on the training data only and then applied to both\n",
                "    matrices. Returns the pair (reduced X_train, reduced X_test).\n",
                "    \"\"\"\n",
                "    k_best = SelectKBest(score_func=scorer, k=k_value).fit(X_train, y_train)\n",
                "    return k_best.transform(X_train), k_best.transform(X_test)\n"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Plot train and test accuracies vs. number of features***\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 13,
            "metadata": {},
            "outputs": [],
            "source": [
                "def plot_accuracies(X_train, X_test, y_train, y_test, model,\n",
                "                    k_start=500, k_stop=26500, k_step=500, scorer=None):\n",
                "    \"\"\"Plot train/test accuracy of `model` against the number of selected features.\n",
                "\n",
                "    For every k in range(k_start, k_stop + 1, k_step) that does not exceed the\n",
                "    number of columns of `X_train`, the k best features are chosen with\n",
                "    `select_features`, `model` is refit and both accuracies are recorded. The\n",
                "    three k values with the smallest train/test gap are highlighted and printed.\n",
                "\n",
                "    Args:\n",
                "        X_train, X_test: Feature matrices with the same number of columns.\n",
                "        y_train, y_test: Labels matching the two matrices.\n",
                "        model: Estimator exposing fit/predict; it is refit in place for every k.\n",
                "        k_start, k_stop, k_step: Inclusive grid of feature counts to evaluate.\n",
                "        scorer: Scoring function handed to `select_features`; chi2 when None.\n",
                "\n",
                "    Raises:\n",
                "        ValueError: if no grid point fits within the available features.\n",
                "    \"\"\"\n",
                "    if scorer is None:\n",
                "        scorer = chi2\n",
                "\n",
                "    # Grid points above the matrix width cannot be selected, so they are dropped.\n",
                "    n_features = X_train.shape[1]\n",
                "    features_counts = [k for k in range(k_start, k_stop + 1, k_step) if k <= n_features]\n",
                "    if not features_counts:\n",
                "        raise ValueError(\n",
                "            f\"No feature count to evaluate: k_start={k_start}, k_stop={k_stop}, \"\n",
                "            f\"k_step={k_step}, available features={n_features}\"\n",
                "        )\n",
                "\n",
                "    train_accuracies = []\n",
                "    test_accuracies = []\n",
                "    for k in features_counts:\n",
                "        X_train_selected, X_test_selected = select_features(X_train, X_test, y_train, scorer, k)\n",
                "        model.fit(X_train_selected, y_train)\n",
                "\n",
                "        # Training set acc\n",
                "        train_accuracies.append(accuracy_score(y_train, model.predict(X_train_selected)))\n",
                "\n",
                "        # Testing set acc\n",
                "        test_accuracies.append(accuracy_score(y_test, model.predict(X_test_selected)))\n",
                "\n",
                "    # Plotting the accuracies\n",
                "    fig, ax = plt.subplots(figsize=(10, 6))\n",
                "    ax.plot(features_counts, train_accuracies, label='Train Accuracy', marker='.')\n",
                "    ax.plot(features_counts, test_accuracies, label='Test Accuracy', marker='.')\n",
                "    ax.set_title('Train and Test Accuracy vs Number of Features')\n",
                "    ax.set_xlabel('Number of Features Selected')\n",
                "    ax.set_ylabel('Accuracy')\n",
                "    ax.legend()\n",
                "    ax.grid()\n",
                "\n",
                "    # Finding closest points (3): smallest absolute train/test gap first\n",
                "    differences = np.abs(np.array(train_accuracies) - np.array(test_accuracies))\n",
                "    closest_indices = np.argsort(differences)[:3]\n",
                "    colors = ['darkgreen', 'mediumseagreen', 'lightgreen']\n",
                "\n",
                "    # Scale the marker to the grid step; a fixed width of 10 is invisible on an axis spanning thousands.\n",
                "    highlight_width = 0.4 * k_step\n",
                "\n",
                "    # Draw a rect spanning the gap between the two curves at each closest point\n",
                "    for i, idx in enumerate(closest_indices):\n",
                "        y_bottom = min(train_accuracies[idx], test_accuracies[idx])\n",
                "        y_top = max(train_accuracies[idx], test_accuracies[idx])\n",
                "\n",
                "        ax.add_patch(plt.Rectangle(\n",
                "            (features_counts[idx] - highlight_width / 2, y_bottom),\n",
                "            highlight_width, y_top - y_bottom,\n",
                "            color=colors[i], alpha=0.5\n",
                "        ))\n",
                "\n",
                "        # Print the number of selected features for each closest point\n",
                "        print(f\"Closest Point {i+1}: Number of Features = {features_counts[idx]}, Train Accuracy = {train_accuracies[idx]}, Test Accuracy = {test_accuracies[idx]}\")\n",
                "\n",
                "    plt.show()"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Evaluate model***\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 14,
            "metadata": {},
            "outputs": [],
            "source": [
                "def print_clf_report_as_table(report):\n",
                "    \"\"\"Print a classification-report dict as a psql-style table.\n",
                "\n",
                "    `report` is expected to hold one metrics dict per class plus the entries\n",
                "    'accuracy', 'macro avg' and 'weighted avg'. Class rows come first, followed\n",
                "    by the accuracy row and the two average rows (printed without support).\n",
                "    \"\"\"\n",
                "    summary_keys = ('accuracy', 'macro avg', 'weighted avg')\n",
                "\n",
                "    # One row per class, in the order the report lists them\n",
                "    rows = [\n",
                "        [label, metrics['precision'], metrics['recall'], metrics['f1-score'], metrics['support']]\n",
                "        for label, metrics in report.items()\n",
                "        if label not in summary_keys\n",
                "    ]\n",
                "\n",
                "    # Overall accuracy is shown in the F1-score column\n",
                "    rows.append(['accuracy', '', '', report['accuracy'], ''])\n",
                "\n",
                "    for average in ('macro avg', 'weighted avg'):\n",
                "        stats = report[average]\n",
                "        rows.append([average, stats['precision'], stats['recall'], stats['f1-score'], ''])\n",
                "\n",
                "    column_names = ['Class', 'Precision', 'Recall', 'F1-score', 'Support']\n",
                "    print(tabulate(rows, headers=column_names, tablefmt='psql'))"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 15,
            "metadata": {},
            "outputs": [],
            "source": [
                "def evaluate_model(X_train, X_test, y_train, y_test, scorer, k_value, model):\n",
                "    \"\"\"Fit `model` on the `k_value` best features and print its evaluation.\n",
                "\n",
                "    Features are reduced with `select_features`, the model is fitted on the\n",
                "    reduced training set, and the train accuracy, test accuracy, their\n",
                "    difference and the per-class test report are printed.\n",
                "    \"\"\"\n",
                "    train_subset, test_subset = select_features(X_train, X_test, y_train, scorer, k_value)\n",
                "    model.fit(train_subset, y_train)\n",
                "\n",
                "    # Accuracy on the data the model was fitted on\n",
                "    train_accuracy = accuracy_score(y_train, model.predict(train_subset))\n",
                "\n",
                "    # Accuracy on the held-out data\n",
                "    y_pred = model.predict(test_subset)\n",
                "    test_accuracy = accuracy_score(y_test, y_pred)\n",
                "\n",
                "    gap = train_accuracy - test_accuracy\n",
                "    print(f'Train Accuracy: {train_accuracy}')\n",
                "    print(f'Test Accuracy: {test_accuracy}')\n",
                "    print(f'Difference: {gap}')\n",
                "\n",
                "    # Per-class precision/recall/F1 on the test set\n",
                "    print_clf_report_as_table(classification_report(y_test, y_pred, output_dict=True))"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# ***Use Dependency Relation Features***"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## ***Apply features extraction on the dataset***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 16,
            "metadata": {},
            "outputs": [],
            "source": [
                "train_df[\"features\"] = train_df[\"Id\"].apply(extract_dependency_features)\n",
                "test_df[\"features\"] = test_df[\"Id\"].apply(extract_dependency_features)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Flatten and Vectorize Tuples***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 17,
            "metadata": {},
            "outputs": [],
            "source": [
                "all_features_train = train_df['features'].apply(flatten_tuples)\n",
                "all_features_flat_train = [' '.join(features) for features in all_features_train]\n",
                "\n",
                "all_features_test = test_df['features'].apply(flatten_tuples)\n",
                "all_features_flat_test = [' '.join(features) for features in all_features_test]"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "#### ***Vectorize the features***\n",
                "- **This will be done using `TF-IDF`**"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 18,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Train set shape after features extraction: (4320, 26869)\n",
                        "Test set shape after features extraction: (480, 26869)\n"
                    ]
                }
            ],
            "source": [
                "vectorizer = TfidfVectorizer()\n",
                "X_train = vectorizer.fit_transform(all_features_flat_train)\n",
                "X_test = vectorizer.transform(all_features_flat_test)\n",
                "print(f\"Train set shape after features extraction: {X_train.shape}\")\n",
                "print(f\"Test set shape after features extraction: {X_test.shape}\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 19,
            "metadata": {},
            "outputs": [],
            "source": [
                "y_train = train_df[\"label\"]\n",
                "y_test = test_df[\"label\"]"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## ***Select best number of features***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 20,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Closest Point 1: Number of Features = 500, Train Accuracy = 0.7800925925925926, Test Accuracy = 0.7166666666666667\n",
                        "Closest Point 2: Number of Features = 1000, Train Accuracy = 0.8694444444444445, Test Accuracy = 0.79375\n",
                        "Closest Point 3: Number of Features = 20000, Train Accuracy = 0.9944444444444445, Test Accuracy = 0.8979166666666667\n"
                    ]
                },
                {
                    "data": {
                        "image/png": "",
                        "text/plain": [
                            "<Figure size 1000x600 with 1 Axes>"
                        ]
                    },
                    "metadata": {},
                    "output_type": "display_data"
                }
            ],
            "source": [
                "model = MultinomialNB()\n",
                "plot_accuracies(X_train, X_test, y_train, y_test, model)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "### ***Tune the model's hyperparameters to reduce overfitting***\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 21,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Closest Point 1: Number of Features = 17500, Train Accuracy = 1.0, Test Accuracy = 0.9604166666666667\n",
                        "Closest Point 2: Number of Features = 18000, Train Accuracy = 1.0, Test Accuracy = 0.9604166666666667\n",
                        "Closest Point 3: Number of Features = 20500, Train Accuracy = 0.9997685185185186, Test Accuracy = 0.9583333333333334\n"
                    ]
                },
                {
                    "data": {
                        "image/png": "",
                        "text/plain": [
                            "<Figure size 1000x600 with 1 Axes>"
                        ]
                    },
                    "metadata": {},
                    "output_type": "display_data"
                }
            ],
            "source": [
                "model = MultinomialNB(alpha=0.01)\n",
                "plot_accuracies(X_train, X_test, y_train, y_test, model)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## ***Evaluate best model***"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 22,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Train Accuracy: 0.999537037037037\n",
                        "Test Accuracy: 0.9520833333333333\n",
                        "Difference: 0.04745370370370372\n",
                        "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
                        "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
                        "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
                        "| Acne                            | 1.0                | 1.0                |   1        | 21.0      |\n",
                        "| Arthritis                       | 1.0                | 1.0                |   1        | 20.0      |\n",
                        "| Bronchial Asthma                | 0.9444444444444444 | 0.8947368421052632 |   0.918919 | 19.0      |\n",
                        "| Cervical spondylosis            | 0.875              | 1.0                |   0.933333 | 21.0      |\n",
                        "| Chicken pox                     | 0.7142857142857143 | 1.0                |   0.833333 | 15.0      |\n",
                        "| Common Cold                     | 1.0                | 1.0                |   1        | 21.0      |\n",
                        "| Dengue                          | 0.8823529411764706 | 0.6818181818181818 |   0.769231 | 22.0      |\n",
                        "| Dimorphic Hemorrhoids           | 1.0                | 1.0                |   1        | 19.0      |\n",
                        "| Fungal infection                | 1.0                | 1.0                |   1        | 26.0      |\n",
                        "| Hypertension                    | 0.9444444444444444 | 0.9444444444444444 |   0.944444 | 18.0      |\n",
                        "| Impetigo                        | 0.9583333333333334 | 1.0                |   0.978723 | 23.0      |\n",
                        "| Jaundice                        | 1.0                | 1.0                |   1        | 22.0      |\n",
                        "| Malaria                         | 1.0                | 1.0                |   1        | 17.0      |\n",
                        "| Migraine                        | 1.0                | 0.9583333333333334 |   0.978723 | 24.0      |\n",
                        "| Pneumonia                       | 0.9565217391304348 | 1.0                |   0.977778 | 22.0      |\n",
                        "| Psoriasis                       | 0.9375             | 0.8823529411764706 |   0.909091 | 17.0      |\n",
                        "| Typhoid                         | 0.9                | 1.0                |   0.947368 | 18.0      |\n",
                        "| Varicose Veins                  | 1.0                | 0.96               |   0.979592 | 25.0      |\n",
                        "| allergy                         | 0.875              | 0.9333333333333333 |   0.903226 | 15.0      |\n",
                        "| diabetes                        | 0.9333333333333333 | 0.8235294117647058 |   0.875    | 17.0      |\n",
                        "| drug reaction                   | 1.0                | 0.875              |   0.933333 | 16.0      |\n",
                        "| gastroesophageal reflux disease | 0.9523809523809523 | 0.9523809523809523 |   0.952381 | 21.0      |\n",
                        "| peptic ulcer disease            | 1.0                | 0.9444444444444444 |   0.971429 | 18.0      |\n",
                        "| urinary tract infection         | 0.9565217391304348 | 0.9565217391304348 |   0.956522 | 23.0      |\n",
                        "| accuracy                        |                    |                    |   0.952083 |           |\n",
                        "| macro avg                       | 0.9512549434024818 | 0.9502873176638152 |   0.948434 |           |\n",
                        "| weighted avg                    | 0.9555412634558654 | 0.9520833333333333 |   0.951736 |           |\n",
                        "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
                    ]
                }
            ],
            "source": [
                "model = MultinomialNB(alpha=0.01)\n",
                "evaluate_model(X_train, X_test, y_train, y_test, chi2, 8000, model)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# ***Use more features***\n",
                "- **Dependency relations**\n",
                "- **POS tags with relations**\n",
                "- **Root words**\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 23,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Train set shape after features extraction: (4320, 27622)\n",
                        "Test set shape after features extraction: (480, 27622)\n"
                    ]
                }
            ],
            "source": [
                "train_df[\"features\"] = train_df[\"Id\"].apply(extract_full_features)\n",
                "test_df[\"features\"] = test_df[\"Id\"].apply(extract_full_features)\n",
                "\n",
                "all_features_train = train_df['features'].apply(flatten_tuples)\n",
                "all_features_flat_train = [' '.join(features) for features in all_features_train]\n",
                "all_features_test = test_df['features'].apply(flatten_tuples)\n",
                "all_features_flat_test = [' '.join(features) for features in all_features_test]\n",
                "\n",
                "vectorizer = TfidfVectorizer()\n",
                "X_train = vectorizer.fit_transform(all_features_flat_train)\n",
                "X_test = vectorizer.transform(all_features_flat_test)\n",
                "print(f\"Train set shape after features extraction: {X_train.shape}\")\n",
                "print(f\"Test set shape after features extraction: {X_test.shape}\")\n",
                "\n",
                "y_train = train_df[\"label\"]\n",
                "y_test = test_df[\"label\"]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 24,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Closest Point 1: Number of Features = 25500, Train Accuracy = 1.0, Test Accuracy = 0.95625\n",
                        "Closest Point 2: Number of Features = 23500, Train Accuracy = 0.9997685185185186, Test Accuracy = 0.9541666666666667\n",
                        "Closest Point 3: Number of Features = 23000, Train Accuracy = 0.9997685185185186, Test Accuracy = 0.9541666666666667\n"
                    ]
                },
                {
                    "data": {
                        "image/png": "",
                        "text/plain": [
                            "<Figure size 1000x600 with 1 Axes>"
                        ]
                    },
                    "metadata": {},
                    "output_type": "display_data"
                }
            ],
            "source": [
                "model = MultinomialNB(alpha=0.01)\n",
                "plot_accuracies(X_train, X_test, y_train, y_test, model)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 25,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Train Accuracy: 0.9986111111111111\n",
                        "Test Accuracy: 0.9479166666666666\n",
                        "Difference: 0.050694444444444486\n",
                        "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
                        "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
                        "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
                        "| Acne                            | 1.0                | 1.0                |   1        | 21.0      |\n",
                        "| Arthritis                       | 1.0                | 1.0                |   1        | 20.0      |\n",
                        "| Bronchial Asthma                | 1.0                | 0.8947368421052632 |   0.944444 | 19.0      |\n",
                        "| Cervical spondylosis            | 0.875              | 1.0                |   0.933333 | 21.0      |\n",
                        "| Chicken pox                     | 0.6818181818181818 | 1.0                |   0.810811 | 15.0      |\n",
                        "| Common Cold                     | 0.875              | 1.0                |   0.933333 | 21.0      |\n",
                        "| Dengue                          | 0.9333333333333333 | 0.6363636363636364 |   0.756757 | 22.0      |\n",
                        "| Dimorphic Hemorrhoids           | 1.0                | 1.0                |   1        | 19.0      |\n",
                        "| Fungal infection                | 1.0                | 1.0                |   1        | 26.0      |\n",
                        "| Hypertension                    | 1.0                | 0.8333333333333334 |   0.909091 | 18.0      |\n",
                        "| Impetigo                        | 0.9583333333333334 | 1.0                |   0.978723 | 23.0      |\n",
                        "| Jaundice                        | 1.0                | 1.0                |   1        | 22.0      |\n",
                        "| Malaria                         | 1.0                | 1.0                |   1        | 17.0      |\n",
                        "| Migraine                        | 1.0                | 0.9166666666666666 |   0.956522 | 24.0      |\n",
                        "| Pneumonia                       | 1.0                | 1.0                |   1        | 22.0      |\n",
                        "| Psoriasis                       | 0.9411764705882353 | 0.9411764705882353 |   0.941176 | 17.0      |\n",
                        "| Typhoid                         | 0.9                | 1.0                |   0.947368 | 18.0      |\n",
                        "| Varicose Veins                  | 1.0                | 0.96               |   0.979592 | 25.0      |\n",
                        "| allergy                         | 0.8666666666666667 | 0.8666666666666667 |   0.866667 | 15.0      |\n",
                        "| diabetes                        | 0.8823529411764706 | 0.8823529411764706 |   0.882353 | 17.0      |\n",
                        "| drug reaction                   | 0.9375             | 0.9375             |   0.9375   | 16.0      |\n",
                        "| gastroesophageal reflux disease | 1.0                | 0.9047619047619048 |   0.95     | 21.0      |\n",
                        "| peptic ulcer disease            | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
                        "| urinary tract infection         | 0.9565217391304348 | 0.9565217391304348 |   0.956522 | 23.0      |\n",
                        "| accuracy                        |                    |                    |   0.947917 |           |\n",
                        "| macro avg                       | 0.9481279619624704 | 0.9470866750330255 |   0.944049 |           |\n",
                        "| weighted avg                    | 0.9537602173046252 | 0.9479166666666666 |   0.947483 |           |\n",
                        "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
                    ]
                }
            ],
            "source": [
                "model = MultinomialNB(alpha=0.01)\n",
                "evaluate_model(X_train, X_test, y_train, y_test, chi2, 7500, model)"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# ***Use Dependency features with N-Grams***\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 26,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Train set shape after features extraction: (4320, 84052)\n",
                        "Test set shape after features extraction: (480, 84052)\n"
                    ]
                }
            ],
            "source": [
                "n1 = 1\n",
                "n2 = 3\n",
                "def extractor(row_id):\n",
                "    return extract_dependency_features_with_n_grams(row_id, n1, n2)\n",
                "\n",
                "\n",
                "train_df[\"features\"] = train_df[\"Id\"].apply(extractor)\n",
                "test_df[\"features\"] = test_df[\"Id\"].apply(extractor)\n",
                "\n",
                "all_features_train = train_df['features'].apply(flatten_tuples)\n",
                "all_features_flat_train = [' '.join(features) for features in all_features_train]\n",
                "all_features_test = test_df['features'].apply(flatten_tuples)\n",
                "all_features_flat_test = [' '.join(features) for features in all_features_test]\n",
                "\n",
                "vectorizer = TfidfVectorizer()\n",
                "X_train = vectorizer.fit_transform(all_features_flat_train)\n",
                "X_test = vectorizer.transform(all_features_flat_test)\n",
                "print(f\"Train set shape after features extraction: {X_train.shape}\")\n",
                "print(f\"Test set shape after features extraction: {X_test.shape}\")\n",
                "\n",
                "y_train = train_df[\"label\"]\n",
                "y_test = test_df[\"label\"]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 27,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Closest Point 1: Number of Features = 66000, Train Accuracy = 1.0, Test Accuracy = 0.975\n",
                        "Closest Point 2: Number of Features = 68000, Train Accuracy = 1.0, Test Accuracy = 0.975\n",
                        "Closest Point 3: Number of Features = 67000, Train Accuracy = 1.0, Test Accuracy = 0.975\n"
                    ]
                },
                {
                    "data": {
                        "image/png": "",
                        "text/plain": [
                            "<Figure size 1000x600 with 1 Axes>"
                        ]
                    },
                    "metadata": {},
                    "output_type": "display_data"
                }
            ],
            "source": [
                "# Make sure to update step in plotting before running this cell\n",
                "model = MultinomialNB(alpha = 0.01)\n",
                "plot_accuracies(X_train, X_test, y_train, y_test, model)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 28,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Train Accuracy: 1.0\n",
                        "Test Accuracy: 0.975\n",
                        "Difference: 0.025000000000000022\n",
                        "+---------------------------------+--------------------+--------------------+------------+-----------+\n",
                        "| Class                           | Precision          | Recall             |   F1-score | Support   |\n",
                        "|---------------------------------+--------------------+--------------------+------------+-----------|\n",
                        "| Acne                            | 1.0                | 1.0                |   1        | 21.0      |\n",
                        "| Arthritis                       | 1.0                | 1.0                |   1        | 20.0      |\n",
                        "| Bronchial Asthma                | 1.0                | 0.9473684210526315 |   0.972973 | 19.0      |\n",
                        "| Cervical spondylosis            | 0.9130434782608695 | 1.0                |   0.954545 | 21.0      |\n",
                        "| Chicken pox                     | 0.75               | 1.0                |   0.857143 | 15.0      |\n",
                        "| Common Cold                     | 1.0                | 1.0                |   1        | 21.0      |\n",
                        "| Dengue                          | 1.0                | 0.7727272727272727 |   0.871795 | 22.0      |\n",
                        "| Dimorphic Hemorrhoids           | 1.0                | 1.0                |   1        | 19.0      |\n",
                        "| Fungal infection                | 1.0                | 1.0                |   1        | 26.0      |\n",
                        "| Hypertension                    | 1.0                | 0.9444444444444444 |   0.971429 | 18.0      |\n",
                        "| Impetigo                        | 0.9583333333333334 | 1.0                |   0.978723 | 23.0      |\n",
                        "| Jaundice                        | 1.0                | 1.0                |   1        | 22.0      |\n",
                        "| Malaria                         | 1.0                | 1.0                |   1        | 17.0      |\n",
                        "| Migraine                        | 1.0                | 1.0                |   1        | 24.0      |\n",
                        "| Pneumonia                       | 1.0                | 1.0                |   1        | 22.0      |\n",
                        "| Psoriasis                       | 1.0                | 0.8823529411764706 |   0.9375   | 17.0      |\n",
                        "| Typhoid                         | 0.9473684210526315 | 1.0                |   0.972973 | 18.0      |\n",
                        "| Varicose Veins                  | 1.0                | 0.96               |   0.979592 | 25.0      |\n",
                        "| allergy                         | 0.9375             | 1.0                |   0.967742 | 15.0      |\n",
                        "| diabetes                        | 1.0                | 0.9411764705882353 |   0.969697 | 17.0      |\n",
                        "| drug reaction                   | 1.0                | 1.0                |   1        | 16.0      |\n",
                        "| gastroesophageal reflux disease | 0.9545454545454546 | 1.0                |   0.976744 | 21.0      |\n",
                        "| peptic ulcer disease            | 1.0                | 0.9444444444444444 |   0.971429 | 18.0      |\n",
                        "| urinary tract infection         | 0.9583333333333334 | 1.0                |   0.978723 | 23.0      |\n",
                        "| accuracy                        |                    |                    |   0.975    |           |\n",
                        "| macro avg                       | 0.9757968341885676 | 0.9746880831013959 |   0.973375 |           |\n",
                        "| weighted avg                    | 0.9784746510441948 | 0.975              |   0.975031 |           |\n",
                        "+---------------------------------+--------------------+--------------------+------------+-----------+\n"
                    ]
                }
            ],
            "source": [
                "model = MultinomialNB(alpha=0.01)\n",
                "evaluate_model(X_train, X_test, y_train, y_test, chi2, 66000, model)"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "NLP",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.9.20"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}