(5) Add initial stanza pipeline results

0145d10f · Almouhannad Hafez · 80280f8d · 0145d10f · 0145d10f · 0145d10f
Commit 0145d10f authored Nov 16, 2024 by Almouhannad Hafez
Show whitespace changes
Inline Side-by-side

Showing with 131 additions and 1 deletion

5.0.Process_texts_stanza.ipynb 5.0.Process_texts_stanza.ipynb +128 -0

constants.py constants.py +3 -1

lemmatized_texts.pkl stanza/lemmatized_texts.pkl +0 -0

No files found.
--- a/5.0.Process_texts_stanza.ipynb
+++ b/5.0.Process_texts_stanza.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ***Setup***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\TOSHIBA\\AppData\\Roaming\\Python\\Python39\\site-packages\\networkx\\utils\\backends.py:135: RuntimeWarning: networkx backend defined more than once: nx-loopback\n",
+      "  backends.update(_get_backends(\"networkx.backends\"))\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "import stanza\n",
+    "\n",
+    "import pickle # Module to store python objects\n",
+    "\n",
+    "from constants import CONSTANTS"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***Load dataset***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(CONSTANTS.AUGMENTED_DATASET_PATH)\n",
+    "assert df.shape == (7200, 3), f\"Expected shape (7200, 3), but got {df.shape}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ***Define stanza pipeline***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', verbose=False)\n",
+    "    # mwt = multi-word token (MWT) expansion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ***Process dataset using stanza pipeline***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "texts = df[\"text\"].to_list()\n",
+    "ids = df[\"Id\"].to_list()\n",
+    "# Run every text through the stanza pipeline and keep the result next to\n",
+    "# its row id. zip() pairs the two lists directly, and no variable named\n",
+    "# `dict` is created, so the builtin stays usable in later cells.\n",
+    "docs = [\n",
+    "    {'id': text_id, 'processed_text': nlp(text)}\n",
+    "    for text_id, text in zip(ids, texts)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ***Store processed texts***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'wb') as f:\n",
+    "    pickle.dump(docs, f)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/constants.py
+++ b/constants.py
@@ -5,3 +5,5 @@ class CONSTANTS:
    AUGMENTED_DATASET_PATH = 'data/augmented_Symptom2Disease.csv'
    AUGMENTED_TRAIN_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Train.csv'
    AUGMENTED_TEST_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Test.csv'
+    LEMMATIZED_TEXTS_OBJECT_PATH = 'stanza/lemmatized_texts.pkl'
\ No newline at end of file
--- a/stanza/lemmatized_texts.pkl
+++ b/stanza/lemmatized_texts.pkl