Commit ddcda0d8 authored by Almouhannad Hafez's avatar Almouhannad Hafez

(5) Add dep. parsing

parent 50aa3ab7
......@@ -104,20 +104,20 @@
]
},
{
"cell_type": "code",
"execution_count": 6,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
" loaded_data = pickle.load(f)"
"# ***Load processed texts***\n"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# ***Load processed texts***"
"with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
" loaded_data = pickle.load(f)"
]
},
{
......@@ -522,13 +522,485 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Apply new pipeline step***"
"# ***Apply new pipeline step (Dep. parsing)***"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"nlp = stanza.Pipeline(lang='en', processors='depparse', depparse_pretagged=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Run the dependency parser over every document, keyed by its Id.\n",
"ids = df[\"Id\"].to_list()\n",
"docs = []\n",
"# Iterate ids directly; the original used `dict` as a variable name,\n",
"# which shadows the Python builtin.\n",
"for doc_id in ids:\n",
"    parsed = {'id': doc_id, 'processed_text': nlp(get_doc_by_id(doc_id))}\n",
"    docs.append(parsed)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"with open(CONSTANTS.DEP_PARSED_TEXTS_OBJECT_PATH, 'wb') as f:\n",
" pickle.dump(docs, f)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.\n",
"[[\n",
" {\n",
" \"id\": 1,\n",
" \"text\": \"I\",\n",
" \"lemma\": \"I\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP\",\n",
" \"feats\": \"Case=Nom|Number=Sing|Person=1|PronType=Prs\",\n",
" \"head\": 4,\n",
" \"deprel\": \"nsubj\",\n",
" \"start_char\": 0,\n",
" \"end_char\": 1\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"text\": \"have\",\n",
" \"lemma\": \"have\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBP\",\n",
" \"feats\": \"Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin\",\n",
" \"head\": 4,\n",
" \"deprel\": \"aux\",\n",
" \"start_char\": 2,\n",
" \"end_char\": 6\n",
" },\n",
" {\n",
" \"id\": 3,\n",
" \"text\": \"been\",\n",
" \"lemma\": \"be\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBN\",\n",
" \"feats\": \"Tense=Past|VerbForm=Part\",\n",
" \"head\": 4,\n",
" \"deprel\": \"aux\",\n",
" \"start_char\": 7,\n",
" \"end_char\": 11\n",
" },\n",
" {\n",
" \"id\": 4,\n",
" \"text\": \"experiencing\",\n",
" \"lemma\": \"experience\",\n",
" \"upos\": \"VERB\",\n",
" \"xpos\": \"VBG\",\n",
" \"feats\": \"Tense=Pres|VerbForm=Part\",\n",
" \"head\": 0,\n",
" \"deprel\": \"root\",\n",
" \"start_char\": 12,\n",
" \"end_char\": 24\n",
" },\n",
" {\n",
" \"id\": 5,\n",
" \"text\": \"a\",\n",
" \"lemma\": \"a\",\n",
" \"upos\": \"DET\",\n",
" \"xpos\": \"DT\",\n",
" \"feats\": \"Definite=Ind|PronType=Art\",\n",
" \"head\": 7,\n",
" \"deprel\": \"det\",\n",
" \"start_char\": 25,\n",
" \"end_char\": 26\n",
" },\n",
" {\n",
" \"id\": 6,\n",
" \"text\": \"skin\",\n",
" \"lemma\": \"skin\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"head\": 7,\n",
" \"deprel\": \"compound\",\n",
" \"start_char\": 27,\n",
" \"end_char\": 31\n",
" },\n",
" {\n",
" \"id\": 7,\n",
" \"text\": \"rash\",\n",
" \"lemma\": \"rash\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"head\": 4,\n",
" \"deprel\": \"obj\",\n",
" \"start_char\": 32,\n",
" \"end_char\": 36\n",
" },\n",
" {\n",
" \"id\": 8,\n",
" \"text\": \"on\",\n",
" \"lemma\": \"on\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"head\": 10,\n",
" \"deprel\": \"case\",\n",
" \"start_char\": 37,\n",
" \"end_char\": 39\n",
" },\n",
" {\n",
" \"id\": 9,\n",
" \"text\": \"my\",\n",
" \"lemma\": \"my\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP$\",\n",
" \"feats\": \"Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs\",\n",
" \"head\": 10,\n",
" \"deprel\": \"nmod:poss\",\n",
" \"start_char\": 40,\n",
" \"end_char\": 42\n",
" },\n",
" {\n",
" \"id\": 10,\n",
" \"text\": \"arms\",\n",
" \"lemma\": \"arm\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 7,\n",
" \"deprel\": \"nmod\",\n",
" \"start_char\": 43,\n",
" \"end_char\": 47,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 11,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 12,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 47,\n",
" \"end_char\": 48\n",
" },\n",
" {\n",
" \"id\": 12,\n",
" \"text\": \"legs\",\n",
" \"lemma\": \"leg\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 10,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 49,\n",
" \"end_char\": 53,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 13,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 15,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 53,\n",
" \"end_char\": 54\n",
" },\n",
" {\n",
" \"id\": 14,\n",
" \"text\": \"and\",\n",
" \"lemma\": \"and\",\n",
" \"upos\": \"CCONJ\",\n",
" \"xpos\": \"CC\",\n",
" \"head\": 15,\n",
" \"deprel\": \"cc\",\n",
" \"start_char\": 55,\n",
" \"end_char\": 58\n",
" },\n",
" {\n",
" \"id\": 15,\n",
" \"text\": \"torso\",\n",
" \"lemma\": \"torso\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"head\": 10,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 59,\n",
" \"end_char\": 64\n",
" },\n",
" {\n",
" \"id\": 16,\n",
" \"text\": \"for\",\n",
" \"lemma\": \"for\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"head\": 20,\n",
" \"deprel\": \"case\",\n",
" \"start_char\": 65,\n",
" \"end_char\": 68\n",
" },\n",
" {\n",
" \"id\": 17,\n",
" \"text\": \"the\",\n",
" \"lemma\": \"the\",\n",
" \"upos\": \"DET\",\n",
" \"xpos\": \"DT\",\n",
" \"feats\": \"Definite=Def|PronType=Art\",\n",
" \"head\": 20,\n",
" \"deprel\": \"det\",\n",
" \"start_char\": 69,\n",
" \"end_char\": 72\n",
" },\n",
" {\n",
" \"id\": 18,\n",
" \"text\": \"past\",\n",
" \"lemma\": \"past\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 20,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 73,\n",
" \"end_char\": 77\n",
" },\n",
" {\n",
" \"id\": 19,\n",
" \"text\": \"few\",\n",
" \"lemma\": \"few\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 20,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 78,\n",
" \"end_char\": 81\n",
" },\n",
" {\n",
" \"id\": 20,\n",
" \"text\": \"weeks\",\n",
" \"lemma\": \"week\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 4,\n",
" \"deprel\": \"obl\",\n",
" \"start_char\": 82,\n",
" \"end_char\": 87,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 21,\n",
" \"text\": \".\",\n",
" \"lemma\": \".\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \".\",\n",
" \"head\": 4,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 87,\n",
" \"end_char\": 88\n",
" }\n",
"], [\n",
" {\n",
" \"id\": 1,\n",
" \"text\": \"It\",\n",
" \"lemma\": \"it\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP\",\n",
" \"feats\": \"Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs\",\n",
" \"head\": 3,\n",
" \"deprel\": \"nsubj\",\n",
" \"start_char\": 89,\n",
" \"end_char\": 91\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"text\": \"is\",\n",
" \"lemma\": \"be\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBZ\",\n",
" \"feats\": \"Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\",\n",
" \"head\": 3,\n",
" \"deprel\": \"cop\",\n",
" \"start_char\": 92,\n",
" \"end_char\": 94\n",
" },\n",
" {\n",
" \"id\": 3,\n",
" \"text\": \"red\",\n",
" \"lemma\": \"red\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 0,\n",
" \"deprel\": \"root\",\n",
" \"start_char\": 95,\n",
" \"end_char\": 98,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 4,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 5,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 98,\n",
" \"end_char\": 99\n",
" },\n",
" {\n",
" \"id\": 5,\n",
" \"text\": \"itchy\",\n",
" \"lemma\": \"itchy\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 3,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 100,\n",
" \"end_char\": 105,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 6,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 8,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 105,\n",
" \"end_char\": 106\n",
" },\n",
" {\n",
" \"id\": 7,\n",
" \"text\": \"and\",\n",
" \"lemma\": \"and\",\n",
" \"upos\": \"CCONJ\",\n",
" \"xpos\": \"CC\",\n",
" \"head\": 8,\n",
" \"deprel\": \"cc\",\n",
" \"start_char\": 107,\n",
" \"end_char\": 110\n",
" },\n",
" {\n",
" \"id\": 8,\n",
" \"text\": \"covered\",\n",
" \"lemma\": \"cover\",\n",
" \"upos\": \"VERB\",\n",
" \"xpos\": \"VBN\",\n",
" \"feats\": \"Tense=Past|VerbForm=Part\",\n",
" \"head\": 3,\n",
" \"deprel\": \"conj\",\n",
" \"start_char\": 111,\n",
" \"end_char\": 118\n",
" },\n",
" {\n",
" \"id\": 9,\n",
" \"text\": \"in\",\n",
" \"lemma\": \"in\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"head\": 13,\n",
" \"deprel\": \"case\",\n",
" \"start_char\": 119,\n",
" \"end_char\": 121\n",
" },\n",
" {\n",
" \"id\": 10,\n",
" \"text\": \"dry\",\n",
" \"lemma\": \"dry\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 13,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 122,\n",
" \"end_char\": 125,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 11,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"head\": 10,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 125,\n",
" \"end_char\": 126\n",
" },\n",
" {\n",
" \"id\": 12,\n",
" \"text\": \"scaly\",\n",
" \"lemma\": \"scaly\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"head\": 13,\n",
" \"deprel\": \"amod\",\n",
" \"start_char\": 127,\n",
" \"end_char\": 132\n",
" },\n",
" {\n",
" \"id\": 13,\n",
" \"text\": \"patches\",\n",
" \"lemma\": \"patch\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"head\": 8,\n",
" \"deprel\": \"obl\",\n",
" \"start_char\": 133,\n",
" \"end_char\": 140,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 14,\n",
" \"text\": \".\",\n",
" \"lemma\": \".\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \".\",\n",
" \"head\": 3,\n",
" \"deprel\": \"punct\",\n",
" \"start_char\": 140,\n",
" \"end_char\": 141,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" }\n",
"]]\n"
]
}
],
"source": [
"with open(CONSTANTS.DEP_PARSED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
"    loaded_data = pickle.load(f)\n",
"# NOTE(review): loaded_data is unpickled but never used below -- the printed\n",
"# doc_0 comes from get_doc_by_id(0). Confirm whether doc_0 was meant to be\n",
"# taken from loaded_data instead (the output shows dependency relations).\n",
"doc_0 = get_doc_by_id(0)\n",
"print(doc_0.text)\n",
"print(doc_0.sentences)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
......@@ -573,9 +1045,7 @@
}
],
"source": [
"# Print one line per word: id, surface form, head id, head text (or 'root'),\n",
"# and the dependency relation. (Trailing whitespace removed from the line.)\n",
"print(*[f'id: {word.id}\\tword: {word.text}\\thead id: {word.head}\\thead: {sent.words[word.head-1].text if word.head > 0 else \"root\"}\\tdeprel: {word.deprel}' for sent in doc_0.sentences for word in sent.words], sep='\\n')"
]
}
],
......
......@@ -6,4 +6,5 @@ class CONSTANTS:
AUGMENTED_TRAIN_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Train.csv'
AUGMENTED_TEST_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Test.csv'
LEMMATIZED_TEXTS_OBJECT_PATH = 'stanza/lemmatized_texts.pkl'
\ No newline at end of file
LEMMATIZED_TEXTS_OBJECT_PATH = 'stanza/lemmatized_texts.pkl'
DEP_PARSED_TEXTS_OBJECT_PATH = 'stanza/dep_parsed_texts.pkl'
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment