Commit 6593c212 authored by Almouhannad Hafez

(5) Add initial stanza pipeline results

parent 0145d10f
...@@ -102,6 +102,414 @@ ...@@ -102,6 +102,414 @@
"with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'wb') as f:\n", "with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'wb') as f:\n",
" pickle.dump(docs, f)" " pickle.dump(docs, f)"
] ]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, 'rb') as f:\n",
" loaded_data = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Each sentence in the dataset has an id, and a document contain its stanza processing\n",
"def get_doc_by_id(target_id):\n",
" for obj in loaded_data:\n",
" if obj[\"id\"] == target_id:\n",
" return obj[\"processed_text\"]\n",
" return None # Return None if not found\n",
"\n",
"doc_0 = get_doc_by_id(0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.\n"
]
}
],
"source": [
"print(doc_0.text)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[\n",
" {\n",
" \"id\": 1,\n",
" \"text\": \"I\",\n",
" \"lemma\": \"I\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP\",\n",
" \"feats\": \"Case=Nom|Number=Sing|Person=1|PronType=Prs\",\n",
" \"start_char\": 0,\n",
" \"end_char\": 1\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"text\": \"have\",\n",
" \"lemma\": \"have\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBP\",\n",
" \"feats\": \"Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin\",\n",
" \"start_char\": 2,\n",
" \"end_char\": 6\n",
" },\n",
" {\n",
" \"id\": 3,\n",
" \"text\": \"been\",\n",
" \"lemma\": \"be\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBN\",\n",
" \"feats\": \"Tense=Past|VerbForm=Part\",\n",
" \"start_char\": 7,\n",
" \"end_char\": 11\n",
" },\n",
" {\n",
" \"id\": 4,\n",
" \"text\": \"experiencing\",\n",
" \"lemma\": \"experience\",\n",
" \"upos\": \"VERB\",\n",
" \"xpos\": \"VBG\",\n",
" \"feats\": \"Tense=Pres|VerbForm=Part\",\n",
" \"start_char\": 12,\n",
" \"end_char\": 24\n",
" },\n",
" {\n",
" \"id\": 5,\n",
" \"text\": \"a\",\n",
" \"lemma\": \"a\",\n",
" \"upos\": \"DET\",\n",
" \"xpos\": \"DT\",\n",
" \"feats\": \"Definite=Ind|PronType=Art\",\n",
" \"start_char\": 25,\n",
" \"end_char\": 26\n",
" },\n",
" {\n",
" \"id\": 6,\n",
" \"text\": \"skin\",\n",
" \"lemma\": \"skin\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"start_char\": 27,\n",
" \"end_char\": 31\n",
" },\n",
" {\n",
" \"id\": 7,\n",
" \"text\": \"rash\",\n",
" \"lemma\": \"rash\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"start_char\": 32,\n",
" \"end_char\": 36\n",
" },\n",
" {\n",
" \"id\": 8,\n",
" \"text\": \"on\",\n",
" \"lemma\": \"on\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"start_char\": 37,\n",
" \"end_char\": 39\n",
" },\n",
" {\n",
" \"id\": 9,\n",
" \"text\": \"my\",\n",
" \"lemma\": \"my\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP$\",\n",
" \"feats\": \"Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs\",\n",
" \"start_char\": 40,\n",
" \"end_char\": 42\n",
" },\n",
" {\n",
" \"id\": 10,\n",
" \"text\": \"arms\",\n",
" \"lemma\": \"arm\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"start_char\": 43,\n",
" \"end_char\": 47,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 11,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"start_char\": 47,\n",
" \"end_char\": 48\n",
" },\n",
" {\n",
" \"id\": 12,\n",
" \"text\": \"legs\",\n",
" \"lemma\": \"leg\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"start_char\": 49,\n",
" \"end_char\": 53,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 13,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"start_char\": 53,\n",
" \"end_char\": 54\n",
" },\n",
" {\n",
" \"id\": 14,\n",
" \"text\": \"and\",\n",
" \"lemma\": \"and\",\n",
" \"upos\": \"CCONJ\",\n",
" \"xpos\": \"CC\",\n",
" \"start_char\": 55,\n",
" \"end_char\": 58\n",
" },\n",
" {\n",
" \"id\": 15,\n",
" \"text\": \"torso\",\n",
" \"lemma\": \"torso\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NN\",\n",
" \"feats\": \"Number=Sing\",\n",
" \"start_char\": 59,\n",
" \"end_char\": 64\n",
" },\n",
" {\n",
" \"id\": 16,\n",
" \"text\": \"for\",\n",
" \"lemma\": \"for\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"start_char\": 65,\n",
" \"end_char\": 68\n",
" },\n",
" {\n",
" \"id\": 17,\n",
" \"text\": \"the\",\n",
" \"lemma\": \"the\",\n",
" \"upos\": \"DET\",\n",
" \"xpos\": \"DT\",\n",
" \"feats\": \"Definite=Def|PronType=Art\",\n",
" \"start_char\": 69,\n",
" \"end_char\": 72\n",
" },\n",
" {\n",
" \"id\": 18,\n",
" \"text\": \"past\",\n",
" \"lemma\": \"past\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"start_char\": 73,\n",
" \"end_char\": 77\n",
" },\n",
" {\n",
" \"id\": 19,\n",
" \"text\": \"few\",\n",
" \"lemma\": \"few\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"start_char\": 78,\n",
" \"end_char\": 81\n",
" },\n",
" {\n",
" \"id\": 20,\n",
" \"text\": \"weeks\",\n",
" \"lemma\": \"week\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"start_char\": 82,\n",
" \"end_char\": 87,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 21,\n",
" \"text\": \".\",\n",
" \"lemma\": \".\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \".\",\n",
" \"start_char\": 87,\n",
" \"end_char\": 88\n",
" }\n",
"], [\n",
" {\n",
" \"id\": 1,\n",
" \"text\": \"It\",\n",
" \"lemma\": \"it\",\n",
" \"upos\": \"PRON\",\n",
" \"xpos\": \"PRP\",\n",
" \"feats\": \"Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs\",\n",
" \"start_char\": 89,\n",
" \"end_char\": 91\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"text\": \"is\",\n",
" \"lemma\": \"be\",\n",
" \"upos\": \"AUX\",\n",
" \"xpos\": \"VBZ\",\n",
" \"feats\": \"Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\",\n",
" \"start_char\": 92,\n",
" \"end_char\": 94\n",
" },\n",
" {\n",
" \"id\": 3,\n",
" \"text\": \"red\",\n",
" \"lemma\": \"red\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"start_char\": 95,\n",
" \"end_char\": 98,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 4,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"start_char\": 98,\n",
" \"end_char\": 99\n",
" },\n",
" {\n",
" \"id\": 5,\n",
" \"text\": \"itchy\",\n",
" \"lemma\": \"itchy\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"start_char\": 100,\n",
" \"end_char\": 105,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 6,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"start_char\": 105,\n",
" \"end_char\": 106\n",
" },\n",
" {\n",
" \"id\": 7,\n",
" \"text\": \"and\",\n",
" \"lemma\": \"and\",\n",
" \"upos\": \"CCONJ\",\n",
" \"xpos\": \"CC\",\n",
" \"start_char\": 107,\n",
" \"end_char\": 110\n",
" },\n",
" {\n",
" \"id\": 8,\n",
" \"text\": \"covered\",\n",
" \"lemma\": \"cover\",\n",
" \"upos\": \"VERB\",\n",
" \"xpos\": \"VBN\",\n",
" \"feats\": \"Tense=Past|VerbForm=Part\",\n",
" \"start_char\": 111,\n",
" \"end_char\": 118\n",
" },\n",
" {\n",
" \"id\": 9,\n",
" \"text\": \"in\",\n",
" \"lemma\": \"in\",\n",
" \"upos\": \"ADP\",\n",
" \"xpos\": \"IN\",\n",
" \"start_char\": 119,\n",
" \"end_char\": 121\n",
" },\n",
" {\n",
" \"id\": 10,\n",
" \"text\": \"dry\",\n",
" \"lemma\": \"dry\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"start_char\": 122,\n",
" \"end_char\": 125,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 11,\n",
" \"text\": \",\",\n",
" \"lemma\": \",\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \",\",\n",
" \"start_char\": 125,\n",
" \"end_char\": 126\n",
" },\n",
" {\n",
" \"id\": 12,\n",
" \"text\": \"scaly\",\n",
" \"lemma\": \"scaly\",\n",
" \"upos\": \"ADJ\",\n",
" \"xpos\": \"JJ\",\n",
" \"feats\": \"Degree=Pos\",\n",
" \"start_char\": 127,\n",
" \"end_char\": 132\n",
" },\n",
" {\n",
" \"id\": 13,\n",
" \"text\": \"patches\",\n",
" \"lemma\": \"patch\",\n",
" \"upos\": \"NOUN\",\n",
" \"xpos\": \"NNS\",\n",
" \"feats\": \"Number=Plur\",\n",
" \"start_char\": 133,\n",
" \"end_char\": 140,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" },\n",
" {\n",
" \"id\": 14,\n",
" \"text\": \".\",\n",
" \"lemma\": \".\",\n",
" \"upos\": \"PUNCT\",\n",
" \"xpos\": \".\",\n",
" \"start_char\": 140,\n",
" \"end_char\": 141,\n",
" \"misc\": \"SpaceAfter=No\"\n",
" }\n",
"]]\n"
]
}
],
"source": [
"print(doc_0.sentences)"
]
} }
], ],
"metadata": { "metadata": {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment