Commit 8917cc79 authored by Almouhannad Hafez's avatar Almouhannad Hafez

(0) Add dataset overview

parent 673dc40c
This diff is collapsed.
......@@ -13,7 +13,6 @@
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
......@@ -29,58 +28,6 @@
"from constants import CONSTANTS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Next, download the `Punkt Tokenizer Model` by running the following cell. If the download does not complete successfully, install the model manually using these links: [Manual installation](https://www.nltk.org/data.html) and [Model link](https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Uncomment and run the following cells if you haven't downloaded these resources already**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('punkt_tab')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### You should see output similar to the following:\n",
"> `[nltk_data] Downloading package punkt to` \n",
"> `[nltk_data] ...\\AppData\\Roaming\\nltk_data...` \n",
"> `[nltk_data] Package punkt is already up-to-date!` \n",
"> `True`"
]
},
{
"cell_type": "markdown",
"metadata": {},
......
......@@ -42,33 +42,6 @@
"**Uncomment and run the following cells if you haven't downloaded these resources already**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# nltk.download('averaged_perceptron_tagger_eng')"
]
},
{
"cell_type": "code",
"execution_count": 5,
......
This diff is collapsed.
name: NLP
channels:
- conda-forge
- defaults
dependencies:
- annotated-types=0.6.0=py39haa95532_0
......@@ -15,10 +16,11 @@ dependencies:
- blas=1.0=mkl
- bleach=4.1.0=pyhd3eb1b0_0
- bottleneck=1.3.7=py39h9128911_0
- brotli=1.0.9=ha925a31_2
- brotli-python=1.0.9=py39hd77b12b_8
- ca-certificates=2024.9.24=haa95532_0
- ca-certificates=2024.8.30=h56e8100_0
- catalogue=2.0.10=py39haa95532_0
- certifi=2024.8.30=py39haa95532_0
- certifi=2024.8.30=pyhd8ed1ab_0
- cffi=1.17.1=py39h827c3e9_0
- charset-normalizer=3.3.2=pyhd3eb1b0_0
- click=8.1.7=py39haa95532_0
......@@ -26,6 +28,8 @@ dependencies:
- colorama=0.4.6=py39haa95532_0
- comm=0.2.1=py39haa95532_0
- confection=0.1.4=py39h9909e9c_0
- contourpy=1.2.0=py39h59b6b97_0
- cycler=0.12.1=pyhd8ed1ab_0
- cymem=2.0.6=py39hd77b12b_0
- cython-blis=0.7.9=py39h080aedc_0
- debugpy=1.6.7=py39hd77b12b_0
......@@ -33,6 +37,8 @@ dependencies:
- defusedxml=0.7.1=pyhd3eb1b0_0
- exceptiongroup=1.2.0=py39haa95532_0
- executing=0.8.3=pyhd3eb1b0_0
- fonttools=4.25.0=pyhd3eb1b0_0
- freetype=2.10.4=h546665d_1
- h11=0.14.0=py39haa95532_0
- httpcore=1.0.2=py39haa95532_0
- httpx=0.27.0=py39haa95532_0
......@@ -40,6 +46,7 @@ dependencies:
- idna=3.7=py39haa95532_0
- importlib-metadata=7.0.1=py39haa95532_0
- importlib_metadata=7.0.1=hd3eb1b0_0
- importlib_resources=6.4.5=pyhd8ed1ab_0
- intel-openmp=2023.1.0=h59b6b97_46320
- ipykernel=6.29.5=py39haa95532_0
- ipython=8.15.0=py39haa95532_0
......@@ -63,16 +70,23 @@ dependencies:
- jupyterlab_pygments=0.1.2=py_0
- jupyterlab_server=2.27.3=py39haa95532_0
- jupyterlab_widgets=3.0.10=py39haa95532_0
- kiwisolver=1.4.4=py39hd77b12b_0
- krb5=1.20.1=h5b6d351_0
- langcodes=3.3.0=pyhd3eb1b0_0
- lcms2=2.12=h83e58a3_0
- lerc=3.0=hd77b12b_0
- libclang=14.0.6=default_hb5a9fac_1
- libclang13=14.0.6=default_h8e68704_1
- libdeflate=1.17=h2bbff1b_1
- libpng=1.6.39=h8cc25b3_0
- libpq=12.17=h906ac69_0
- libsodium=1.0.18=h62dcd97_0
- libtiff=4.5.1=hd77b12b_0
- libwebp-base=1.3.2=h3d04722_1
- lz4-c=1.9.4=h2bbff1b_1
- markdown-it-py=2.2.0=py39haa95532_1
- markupsafe=2.1.3=py39h2bbff1b_0
- matplotlib-base=3.9.2=py39he19b0ae_0
- matplotlib-inline=0.1.6=py39haa95532_0
- mdurl=0.1.0=py39haa95532_0
- mistune=2.0.4=py39haa95532_0
......@@ -80,6 +94,7 @@ dependencies:
- mkl-service=2.4.0=py39h2bbff1b_1
- mkl_fft=1.3.10=py39h827c3e9_0
- mkl_random=1.2.7=py39hc64d2fc_0
- munkres=1.1.4=pyh9f0ad1d_0
- murmurhash=1.0.7=py39hd77b12b_0
- nbclient=0.8.0=py39haa95532_0
- nbconvert=7.16.4=py39haa95532_0
......@@ -91,6 +106,7 @@ dependencies:
- numexpr=2.10.1=py39h4cd664f_0
- numpy=1.26.4=py39h055cbcc_0
- numpy-base=1.26.4=py39h65a83cf_0
- openjpeg=2.5.2=hae555c5_0
- openssl=3.0.15=h827c3e9_0
- overrides=7.4.0=py39haa95532_0
- packaging=24.1=py39haa95532_0
......@@ -98,6 +114,8 @@ dependencies:
- pandocfilters=1.5.0=pyhd3eb1b0_0
- parso=0.8.3=pyhd3eb1b0_0
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pillow=10.4.0=py39h827c3e9_0
- pip=24.3.1=pyh8b19718_0
- platformdirs=3.10.0=py39haa95532_0
- ply=3.11=py39haa95532_0
- preshed=3.0.6=py39h6c2663c_0
......@@ -110,6 +128,7 @@ dependencies:
- pydantic=2.8.2=py39haa95532_0
- pydantic-core=2.20.1=py39hefb1915_0
- pygments=2.15.1=py39haa95532_1
- pyparsing=3.2.0=pyhd8ed1ab_1
- pyqt=5.15.10=py39hd77b12b_0
- pyqt5-sip=12.13.0=py39h2bbff1b_0
- pysocks=1.7.1=py39haa95532_0
......@@ -172,6 +191,7 @@ dependencies:
- widgetsnbextension=4.0.10=py39haa95532_0
- win_inet_pton=1.1.0=py39haa95532_0
- winpty=0.4.3=4
- wordcloud=1.9.3=py39h2bbff1b_0
- xz=5.4.6=h8cc25b3_1
- yaml=0.2.5=he774522_0
- zeromq=4.3.5=hd77b12b_0
......@@ -184,7 +204,6 @@ dependencies:
- emoji==2.14.0
- gensim==4.3.3
- huggingface-hub==0.26.2
- pip==24.3.1
- pyahocorasick==2.1.0
- safetensors==0.4.5
- scikit-learn==1.5.2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment