Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Sign in
Toggle navigation
D
DM-Project
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
almohanad.hafez
DM-Project
Commits
91bfa09b
Commit
91bfa09b
authored
Oct 29, 2024
by
Almouhannad
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add algorithms
parent
fb8fcf89
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
658 additions
and
17 deletions
+658
-17
.gitignore
.gitignore
+1
-0
constants.py
constants.py
+3
-1
hw1.ipynb
hw1.ipynb
+654
-16
No files found.
.gitignore
View file @
91bfa09b
__pycache__/constants.cpython-311.pyc
data/bread_basket_preprocessed.csv
constants.py
View file @
91bfa09b
...
...
@@ -2,4 +2,6 @@ class CONSTANTS:
DATASET_PATH
=
'data/bread_basket.csv'
DATASET_SHAPE
=
(
20507
,
5
)
PREPROCESSED_DATASET_PATH
=
'data/bread_basket_preprocessed.csv'
PREPROCESSED_DATASET_SHAPE
=
(
9465
,
94
)
\ No newline at end of file
PREPROCESSED_DATASET_SHAPE
=
(
9465
,
94
)
MIN_SUPPORT_VALUE
=
0.2
# Adjust this value based on problem
MIN_CONFIDENCE_VALUE
=
0.5
# Adjust this value based on problem
\ No newline at end of file
hw1.ipynb
View file @
91bfa09b
...
...
@@ -33,9 +33,21 @@
"import pandas as pd\n",
"\n",
"from mlxtend.frequent_patterns import apriori, association_rules\n",
"from mlxtend.frequent_patterns import fpgrowth\n",
"\n",
"from constants import CONSTANTS"
"from mlxtend.frequent_patterns import fpgrowth"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from constants import CONSTANTS\n",
"# Some more magic so that the notebook will reload external python modules;\n",
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%reload_ext autoreload"
]
},
{
...
...
@@ -54,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count":
3
,
"execution_count":
4
,
"metadata": {},
"outputs": [
{
...
...
@@ -81,7 +93,7 @@
},
{
"cell_type": "code",
"execution_count":
4
,
"execution_count":
5
,
"metadata": {},
"outputs": [
{
...
...
@@ -119,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count":
5
,
"execution_count":
6
,
"metadata": {},
"outputs": [
{
...
...
@@ -143,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count":
6
,
"execution_count":
7
,
"metadata": {},
"outputs": [
{
...
...
@@ -170,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count":
7
,
"execution_count":
8
,
"metadata": {},
"outputs": [
{
...
...
@@ -196,7 +208,7 @@
},
{
"cell_type": "code",
"execution_count":
8
,
"execution_count":
9
,
"metadata": {},
"outputs": [
{
...
...
@@ -236,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count":
9
,
"execution_count":
10
,
"metadata": {},
"outputs": [
{
...
...
@@ -281,7 +293,7 @@
},
{
"cell_type": "code",
"execution_count": 1
0
,
"execution_count": 1
1
,
"metadata": {},
"outputs": [
{
...
...
@@ -355,7 +367,7 @@
"5 3 Cookies"
]
},
"execution_count": 1
0
,
"execution_count": 1
1
,
"metadata": {},
"output_type": "execute_result"
}
...
...
@@ -367,7 +379,7 @@
},
{
"cell_type": "code",
"execution_count": 1
1
,
"execution_count": 1
2
,
"metadata": {},
"outputs": [
{
...
...
@@ -394,7 +406,7 @@
},
{
"cell_type": "code",
"execution_count": 1
2
,
"execution_count": 1
3
,
"metadata": {},
"outputs": [
{
...
...
@@ -436,11 +448,13 @@
},
{
"cell_type": "code",
"execution_count": 1
3
,
"execution_count": 1
4
,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)"
"df.to_csv(CONSTANTS.PREPROCESSED_DATASET_PATH, index=False)\n",
"# Free memory\n",
"df = None"
]
},
{
...
...
@@ -450,6 +464,318 @@
"# ***2. Extracting rules using Apriori***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2.1. Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully\n"
]
}
],
"source": [
"df = None\n",
"df = pd.read_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(f\"Dataset loaded successfully\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll deal only with first 15 transactions**"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df = df.head(15)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2.2. Get repeated item sets***"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repeated item sets using Apriori with min_support = 0.2:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>support</th>\n",
" <th>itemsets</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.466667</td>\n",
" <td>(Bread)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.266667</td>\n",
" <td>(Coffee)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.333333</td>\n",
" <td>(Medialuna)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.200000</td>\n",
" <td>(Muffin)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.400000</td>\n",
" <td>(Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.200000</td>\n",
" <td>(Scandinavian)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.200000</td>\n",
" <td>(Pastry, Bread)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.200000</td>\n",
" <td>(Coffee, Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.200000</td>\n",
" <td>(Pastry, Medialuna)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" support itemsets\n",
"0 0.466667 (Bread)\n",
"1 0.266667 (Coffee)\n",
"2 0.333333 (Medialuna)\n",
"3 0.200000 (Muffin)\n",
"4 0.400000 (Pastry)\n",
"5 0.200000 (Scandinavian)\n",
"6 0.200000 (Pastry, Bread)\n",
"7 0.200000 (Coffee, Pastry)\n",
"8 0.200000 (Pastry, Medialuna)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_apriori = apriori(df, min_support=min_support, use_colnames=True)\n",
"print(f\"Repeated item sets using Apriori with min_support = {min_support}:\")\n",
"repeated_item_sets_apriori"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2.3. Get rules***"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Association rules using Apriori with min_support = 0.2 and min_confidence = 0.5:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" <th>zhangs_metric</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Bread)</td>\n",
" <td>0.400000</td>\n",
" <td>0.466667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.071429</td>\n",
" <td>0.013333</td>\n",
" <td>1.066667</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.75</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>2.400000</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n",
" <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>1.466667</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.333333</td>\n",
" <td>0.555556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.60</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support \\\n",
"0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"1 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"2 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"3 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"4 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"\n",
" confidence lift leverage conviction zhangs_metric \n",
"0 0.50 1.071429 0.013333 1.066667 0.111111 \n",
"1 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"2 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"3 0.50 1.500000 0.066667 1.333333 0.555556 \n",
"4 0.60 1.500000 0.066667 1.500000 0.500000 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_apriori = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n",
"print(f\"Association rules using Apriori with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_apriori"
]
},
{
"cell_type": "markdown",
"metadata": {},
...
...
@@ -457,6 +783,318 @@
"# ***3. Extracting rules using FP Growth***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3.1. Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully\n"
]
}
],
"source": [
"df = None\n",
"df = pd.read_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(f\"Dataset loaded successfully\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll deal only with first 15 transactions**"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df = df.head(15)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3.2. Get repeated item sets***\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repeated item sets using FP Growth with min_support = 0.2:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>support</th>\n",
" <th>itemsets</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.466667</td>\n",
" <td>(Bread)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.200000</td>\n",
" <td>(Scandinavian)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.200000</td>\n",
" <td>(Muffin)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.400000</td>\n",
" <td>(Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.266667</td>\n",
" <td>(Coffee)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.333333</td>\n",
" <td>(Medialuna)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.200000</td>\n",
" <td>(Pastry, Bread)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.200000</td>\n",
" <td>(Coffee, Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.200000</td>\n",
" <td>(Pastry, Medialuna)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" support itemsets\n",
"0 0.466667 (Bread)\n",
"1 0.200000 (Scandinavian)\n",
"2 0.200000 (Muffin)\n",
"3 0.400000 (Pastry)\n",
"4 0.266667 (Coffee)\n",
"5 0.333333 (Medialuna)\n",
"6 0.200000 (Pastry, Bread)\n",
"7 0.200000 (Coffee, Pastry)\n",
"8 0.200000 (Pastry, Medialuna)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_fpg = fpgrowth(df, min_support=min_support, use_colnames=True)\n",
"print(f\"Repeated item sets using FP Growth with min_support = {min_support}:\")\n",
"repeated_item_sets_fpg"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3.3. Get rules***\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Association rules using FP Growth with min_support = 0.2 and min_confidence = 0.5:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" <th>zhangs_metric</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Bread)</td>\n",
" <td>0.400000</td>\n",
" <td>0.466667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.071429</td>\n",
" <td>0.013333</td>\n",
" <td>1.066667</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.75</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>2.400000</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n",
" <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>1.466667</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.333333</td>\n",
" <td>0.555556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.60</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support \\\n",
"0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"1 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"2 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"3 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"4 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"\n",
" confidence lift leverage conviction zhangs_metric \n",
"0 0.50 1.071429 0.013333 1.066667 0.111111 \n",
"1 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"2 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"3 0.50 1.500000 0.066667 1.333333 0.555556 \n",
"4 0.60 1.500000 0.066667 1.500000 0.500000 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_fpg = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n",
"print(f\"Association rules using FP Growth with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_fpg"
]
},
{
"cell_type": "markdown",
"metadata": {},
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment