Commit 0145d10f authored by Almouhannad Hafez's avatar Almouhannad Hafez

(5) Add initial stanza pipeline results

parent 80280f8d
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Setup***"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\TOSHIBA\\AppData\\Roaming\\Python\\Python39\\site-packages\\networkx\\utils\\backends.py:135: RuntimeWarning: networkx backend defined more than once: nx-loopback\n",
" backends.update(_get_backends(\"networkx.backends\"))\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"import stanza\n",
"\n",
"import pickle # Module to store python objects\n",
"\n",
"from constants import CONSTANTS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Read the augmented dataset, then fail fast if its dimensions are not the expected ones.\n",
"df = pd.read_csv(CONSTANTS.AUGMENTED_DATASET_PATH)\n",
"assert (7200, 3) == df.shape, (\n",
"    f\"Expected shape (7200, 3), but got {df.shape}\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***Define stanza pipeline***"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# English pipeline with logging silenced; 'mwt' = multi-word tokenization.\n",
"nlp = stanza.Pipeline(\n",
"    lang='en',\n",
"    processors='tokenize,mwt,pos,lemma',\n",
"    verbose=False,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Process dataset using stanza pipeline***"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"texts = df[\"text\"].to_list()\n",
"ids = df[\"Id\"].to_list()\n",
"\n",
"# Run the pipeline once per row and keep each result paired with its row Id.\n",
"# The comprehension keeps its loop names local, so nothing in the kernel\n",
"# namespace ends up shadowing a built-in such as `dict`.\n",
"docs = [\n",
"    {'id': doc_id, 'processed_text': nlp(text)}\n",
"    for doc_id, text in zip(ids, texts)\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Store processed texts***"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Serialize the list of per-row records to the configured pickle path (binary mode).\n",
"with open(CONSTANTS.LEMMATIZED_TEXTS_OBJECT_PATH, mode='wb') as pkl_file:\n",
"    pickle.dump(docs, pkl_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
...@@ -5,3 +5,5 @@ class CONSTANTS: ...@@ -5,3 +5,5 @@ class CONSTANTS:
AUGMENTED_DATASET_PATH = 'data/augmented_Symptom2Disease.csv' AUGMENTED_DATASET_PATH = 'data/augmented_Symptom2Disease.csv'
AUGMENTED_TRAIN_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Train.csv' AUGMENTED_TRAIN_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Train.csv'
AUGMENTED_TEST_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Test.csv' AUGMENTED_TEST_SET_PATH = 'data/augmented_Preprocessed_Symptom2Disease_Test.csv'
LEMMATIZED_TEXTS_OBJECT_PATH = 'stanza/lemmatized_texts.pkl'
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment