Commit ddcda0d8 authored by Almouhannad Hafez's avatar Almouhannad Hafez

(5) Add dep. parsing

parent 50aa3ab7
......@@ -104,20 +104,20 @@
]
},
{
"cell_type": "code",
"execution_count": 6,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
" loaded_data = pickle.load(f)"
"# ***Load processed texts***\n"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# ***Load processed texts***"
"with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
" loaded_data = pickle.load(f)"
]
},
{
......@@ -522,13 +522,485 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Apply new pipeline step***"
"# ***Apply new pipeline step (Dep. parsing)***"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"nlp = stanza.Pipeline(lang='en', processors='depparse', depparse_pretagged=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Run the dependency parser over every document, keyed by its Id.\n",
"ids = df[\"Id\"].to_list()\n",
"docs = []\n",
"# Iterate ids directly; the original used `dict` as a variable name,\n",
"# which shadows the Python builtin.\n",
"for doc_id in ids:\n",
"    parsed = {'id': doc_id, 'processed_text': nlp(get_doc_by_id(doc_id))}\n",
"    docs.append(parsed)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"with open(CONSTANTS.DEP_PARSED_TEXTS_OBJECT_PATH, 'wb') as f:\n",
" pickle.dump(docs, f)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.\n",
"[[\n",
" {\n",
" \"id\": 1,\n",
" \"text\": \"I\",\n",
" \"lemma\": \"I\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP\",\n",
" \"feats\": \"Case=Nom|Number=Sing|Person=1|PronType=Prs\",\n",
" \"head\": 4,\n",
" \"deprel\": \"nsubj\",\n",
" \"start_char\": 0,\n",
" \"end_char\": 1\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"text\": \"have\",\n",
" \"lemma\": \"have\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBP\",\n",
" \"feats\": \"Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin\",\n",
" \"head\": 4,\n",
" \"deprel\": \"aux\",\n",
" \"start_char\": 2,\n",
" \"end_char\": 6\n",
" },\n",
" {\n",
" \"id\": 3,\n",
" \"text\": \"been\",\n",
" \"lemma\": \"be\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBN\",\n",
" \"feats\": \"Tense=Past|VerbForm=Part\",\n",
" \"head\": 4,\n",
" \"deprel\": \"aux\",\n",
" \"start_char\": 7,\n",
" \"end_char\": 11\n",
" },\n",
" {\n",
" \"id\": 4,\n",
" \"text\": \"experiencing\",\n",
" \"lemma\": \"experience\",\n",
" \"upos\": \"VERB\",\n",
" \"xpos\": \"VBG\",\n",
" \"feats\": \"Tense=Pres|VerbForm=Part\",\n",
" \"head\": 0,\n",
" \"deprel\": \"root\",\n",
" \"start_char\": 12,\n",
" \"end_char\": 24\n",
" },\n",
" {\n",
" \"id\": 5,\n",
" \"text\": \"a\",\n",
" \"lemma\": \"a\",\n",
" \"upos\": \"DET\",\n",
" \"xpos\": \"DT\",\n",
" \"feats\": \"Definite=Ind|PronType=Art\",\n",
" \"head\": 7,\n",
" \"deprel\": \"det\",\n",
" \"start_char\": 25,\n",
" \"end_char\": 26\n",
" },\n",
" {\n",
" \"id\": 6,\n",
" \"text\": \"skin\",\n",
" \"lemma\": \"skin\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"head\": 7,\n",
" \"deprel\": \"compound\",\n",
" \"start_char\": 27,\n",
" \"end_char\": 31\n",
" },\n",
" {\n",
" \"id\": 7,\n",
" \"text\": \"rash\",\n",
" \"lemma\": \"rash\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"head\": 4,\n",
" \"deprel\": \"obj\",\n",
" \"start_char\": 32,\n",
" \"end_char\": 36\n",
" },\n",
" {\n",
" \"id\": 8,\n",
" \"text\": \"on\",\n",
" \"lemma\": \"on\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"head\": 10,\n",
" \"deprel\": \"case\",\n",
" \"start_char\": 37,\n",
" \"end_char\": 39\n",
" },\n",
" {\n",
" \"id\": 9,\n",
" \"text\": \"my\",\n",
" \"lemma\": \"my\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP$\",\n",
" \"feats\": \"Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs\",\n",
" \"head\": 10,\n",
" \"deprel\": \"nmod:poss\",\n",
" \"start_char\": 40,\n",
" \"end_char\": 42\n",
" },\n",
" {\n",
" \"id\": 10,\n",
" \"text\": \"arms\",\n",
" \"lemma\": \"arm\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 7,\n",
" \"deprel\": \"nmod\",\n",
" \"start_char\": 43,\n",
" \"end_char\": 47,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 11,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 12,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 47,\n",
" \"end_char\": 48\n",
" },\n",
" {\n",
" \"id\": 12,\n",
" \"text\": \"legs\",\n",
" \"lemma\": \"leg\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 10,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 49,\n",
" \"end_char\": 53,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 13,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 15,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 53,\n",
" \"end_char\": 54\n",
" },\n",
" {\n",
" \"id\": 14,\n",
" \"text\": \"and\",\n",
" \"lemma\": \"and\",\n",
" \"upos\": \"CCONJ\",\n",
" \"xpos\": \"CC\",\n",
" \"head\": 15,\n",
" \"deprel\": \"cc\",\n",
" \"start_char\": 55,\n",
" \"end_char\": 58\n",
" },\n",
" {\n",
" \"id\": 15,\n",
" \"text\": \"torso\",\n",
" \"lemma\": \"torso\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"head\": 10,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 59,\n",
" \"end_char\": 64\n",
" },\n",
" {\n",
" \"id\": 16,\n",
" \"text\": \"for\",\n",
" \"lemma\": \"for\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"head\": 20,\n",
" \"deprel\": \"case\",\n",
" \"start_char\": 65,\n",
" \"end_char\": 68\n",
" },\n",
" {\n",
" \"id\": 17,\n",
" \"text\": \"the\",\n",
" \"lemma\": \"the\",\n",
" \"upos\": \"DET\",\n",
" \"xpos\": \"DT\",\n",
" \"feats\": \"Definite=Def|PronType=Art\",\n",
" \"head\": 20,\n",
" \"deprel\": \"det\",\n",
" \"start_char\": 69,\n",
" \"end_char\": 72\n",
" },\n",
" {\n",
" \"id\": 18,\n",
" \"text\": \"past\",\n",
" \"lemma\": \"past\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 20,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 73,\n",
" \"end_char\": 77\n",
" },\n",
" {\n",
" \"id\": 19,\n",
" \"text\": \"few\",\n",
" \"lemma\": \"few\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 20,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 78,\n",
" \"end_char\": 81\n",
" },\n",
" {\n",
" \"id\": 20,\n",
" \"text\": \"weeks\",\n",
" \"lemma\": \"week\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 4,\n",
" \"deprel\": \"obl\",\n",
" \"start_char\": 82,\n",
" \"end_char\": 87,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 21,\n",
" \"text\": \".\",\n",
" \"lemma\": \".\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \".\",\n",
" \"head\": 4,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 87,\n",
" \"end_char\": 88\n",
" }\n",
"], [\n",
" {\n",
" \"id\": 1,\n",
" \"text\": \"It\",\n",
" \"lemma\": \"it\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP\",\n",
" \"feats\": \"Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs\",\n",
" \"head\": 3,\n",
" \"deprel\": \"nsubj\",\n",
" \"start_char\": 89,\n",
" \"end_char\": 91\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"text\": \"is\",\n",
" \"lemma\": \"be\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBZ\",\n",
" \"feats\": \"Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\",\n",
" \"head\": 3,\n",
" \"deprel\": \"cop\",\n",
" \"start_char\": 92,\n",
" \"end_char\": 94\n",
" },\n",
" {\n",
" \"id\": 3,\n",
" \"text\": \"red\",\n",
" \"lemma\": \"red\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 0,\n",
" \"deprel\": \"root\",\n",
" \"start_char\": 95,\n",
" \"end_char\": 98,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 4,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 5,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 98,\n",
" \"end_char\": 99\n",
" },\n",
" {\n",
" \"id\": 5,\n",
" \"text\": \"itchy\",\n",
" \"lemma\": \"itchy\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 3,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 100,\n",
" \"end_char\": 105,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 6,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 8,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 105,\n",
" \"end_char\": 106\n",
" },\n",
" {\n",
" \"id\": 7,\n",
" \"text\": \"and\",\n",
" \"lemma\": \"and\",\n",
" \"upos\": \"CCONJ\",\n",
" \"xpos\": \"CC\",\n",
" \"head\": 8,\n",
" \"deprel\": \"cc\",\n",
" \"start_char\": 107,\n",
" \"end_char\": 110\n",
" },\n",
" {\n",
" \"id\": 8,\n",
" \"text\": \"covered\",\n",
" \"lemma\": \"cover\",\n",
" \"upos\": \"VERB\",\n",
" \"xpos\": \"VBN\",\n",
" \"feats\": \"Tense=Past|VerbForm=Part\",\n",
" \"head\": 3,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 111,\n",
" \"end_char\": 118\n",
" },\n",
" {\n",
" \"id\": 9,\n",
" \"text\": \"in\",\n",
" \"lemma\": \"in\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"head\": 13,\n",
" \"deprel\": \"case\",\n",
" \"start_char\": 119,\n",
" \"end_char\": 121\n",
" },\n",
" {\n",
" \"id\": 10,\n",
" \"text\": \"dry\",\n",
" \"lemma\": \"dry\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 13,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 122,\n",
" \"end_char\": 125,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 11,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 10,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 125,\n",
" \"end_char\": 126\n",
" },\n",
" {\n",
" \"id\": 12,\n",
" \"text\": \"scaly\",\n",
" \"lemma\": \"scaly\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 13,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 127,\n",
" \"end_char\": 132\n",
" },\n",
" {\n",
" \"id\": 13,\n",
" \"text\": \"patches\",\n",
" \"lemma\": \"patch\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 8,\n",
" \"deprel\": \"obl\",\n",
" \"start_char\": 133,\n",
" \"end_char\": 140,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 14,\n",
" \"text\": \".\",\n",
" \"lemma\": \".\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \".\",\n",
" \"head\": 3,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 140,\n",
" \"end_char\": 141,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" }\n",
"]]\n"
]
}
],
"source": [
"with open(CONSTANTS.DEP_PARSED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
"    loaded_data = pickle.load(f)\n",
"# NOTE(review): loaded_data is unpickled but never used below -- the printed\n",
"# doc_0 comes from get_doc_by_id(0). Confirm whether doc_0 was meant to be\n",
"# taken from loaded_data instead (the output shows dependency relations).\n",
"doc_0 = get_doc_by_id(0)\n",
"print(doc_0.text)\n",
"print(doc_0.sentences)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
......@@ -573,9 +1045,7 @@
}
],
"source": [
"# Print one line per word: id, surface form, head id, head text (or 'root'),\n",
"# and the dependency relation. (Trailing whitespace removed from the line.)\n",
"print(*[f'id: {word.id}\\tword: {word.text}\\thead id: {word.head}\\thead: {sent.words[word.head-1].text if word.head > 0 else \"root\"}\\tdeprel: {word.deprel}' for sent in doc_0.sentences for word in sent.words], sep='\\n')"
]
}
],
......
......@@ -6,4 +6,5 @@ class CONSTANTS:
AUGMENTED_TRAIN_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Train.csv'
AUGMENTED_TEST_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Test.csv'
LEMMATIZED_TEXTS_OBJECT_PATH = 'stanza/lemmatized_texts.pkl'
\ No newline at end of file
LEMMATIZED_TEXTS_OBJECT_PATH = 'stanza/lemmatized_texts.pkl'
DEP_PARSED_TEXTS_OBJECT_PATH = 'stanza/dep_parsed_texts.pkl'
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment