Commit c8a34262 authored by Almouhannad Hafez's avatar Almouhannad Hafez

Add Eclat

parent df109ff0
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Contents***\n",
"- **[Setup](#0.-Setup)**\n",
"- **[Load dataset](#1.-Load-dataset)**\n",
"- **[Get repeated item sets](#2.-Get-repeated-item-sets)**\n",
"- **[Get rules](#3.-Get-rules)**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***0. Setup***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Please note that the following cell may require working VPN to work**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# %pip install pyECLAT\n",
"# %pip install numpy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from helpers import HELPERS\n",
"from constants import CONSTANTS\n",
"# Some more magic so that the notebook will reload external python modules;\n",
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%reload_ext autoreload"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***1. Load dataset***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully with shape: (9465, 94)\n"
]
}
],
"source": [
"df = None\n",
"df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll deal only with first 15 transactions**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = df.head(15)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***2. Get repeated item sets***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repeated item sets using Eclat with min_support = 0.2:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>itemsets</th>\n",
" <th>support</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[Coffee]</td>\n",
" <td>0.266667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[Medialuna]</td>\n",
" <td>0.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[Bread]</td>\n",
" <td>0.466667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[Muffin]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[Scandinavian]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[Pastry]</td>\n",
" <td>0.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>[Coffee, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[Medialuna, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>[Bread, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" itemsets support\n",
"0 [Coffee] 0.266667\n",
"1 [Medialuna] 0.333333\n",
"2 [Bread] 0.466667\n",
"3 [Muffin] 0.200000\n",
"4 [Scandinavian] 0.200000\n",
"5 [Pastry] 0.400000\n",
"6 [Coffee, Pastry] 0.200000\n",
"7 [Medialuna, Pastry] 0.200000\n",
"8 [Bread, Pastry] 0.200000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_eclat = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'eclat',\n",
" data = df,\n",
" min_support = min_support)\n",
"print(f\"Repeated item sets using Eclat with min_support = {min_support}:\")\n",
"repeated_item_sets_eclat"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***3. Get rules***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Association rules using Eclat with min_support = 0.2 and min_confidence = 0.5:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" <th>zhangs_metric</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.75</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>2.400000</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n",
" <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>1.466667</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.60</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.333333</td>\n",
" <td>0.555556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Bread)</td>\n",
" <td>0.400000</td>\n",
" <td>0.466667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.071429</td>\n",
" <td>0.013333</td>\n",
" <td>1.066667</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support \\\n",
"0 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"1 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"2 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"3 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"4 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"\n",
" confidence lift leverage conviction zhangs_metric \n",
"0 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"1 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"2 0.60 1.500000 0.066667 1.500000 0.500000 \n",
"3 0.50 1.500000 0.066667 1.333333 0.555556 \n",
"4 0.50 1.071429 0.013333 1.066667 0.111111 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_eclat = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_eclat, \n",
" min_confidence = min_confidence\n",
" )\n",
"print(f\"Association rules using Eclat with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_eclat"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ML",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import pandas as pd import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from mlxtend.frequent_patterns import fpgrowth from pyECLAT import ECLAT
import numpy as np
class HELPERS: class HELPERS:
def read_dataset_from_csv(path: str) -> pd.DataFrame: def read_dataset_from_csv(path: str) -> pd.DataFrame:
...@@ -38,7 +39,7 @@ class HELPERS: ...@@ -38,7 +39,7 @@ class HELPERS:
Args: Args:
algorithm (str): Algorithm to use for finding repeated item sets. Must be either 'apriori' or 'fpgrowth' algorithm (str): Algorithm to use for finding repeated item sets. Must be 'apriori', 'fpgrowth', or 'eclat'
data (pd.DataFrame): Data in form of a pandas DataFrame data (pd.DataFrame): Data in form of a pandas DataFrame (one-hot encoded)
min_support (float): minimum support threshold for the item sets. min_support (float): minimum support threshold for the item sets.
Returns: Returns:
...@@ -47,7 +48,7 @@ class HELPERS: ...@@ -47,7 +48,7 @@ class HELPERS:
# Type checking # Type checking
assert isinstance(algorithm, str), "Algorithm must be a string" assert isinstance(algorithm, str), "Algorithm must be a string"
assert algorithm in ["apriori", "fpgrowth"], "Algorithm must be either 'apriori' or 'fpgrowth'" assert algorithm in ["apriori", "fpgrowth", "eclat"], "Algorithm must be either 'apriori' or 'fpgrowth' or 'eclat'"
assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame" assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number" assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number"
...@@ -57,10 +58,37 @@ class HELPERS: ...@@ -57,10 +58,37 @@ class HELPERS:
repeated_item_sets_finder = apriori repeated_item_sets_finder = apriori
elif algorithm == "fpgrowth": elif algorithm == "fpgrowth":
repeated_item_sets_finder = fpgrowth repeated_item_sets_finder = fpgrowth
elif algorithm == "eclat":
# Convert one-hot encoded DataFrame to transaction format to be able to use with Eclat module
transactions = []
for _, row in data.iterrows():
transaction = row.index[row == 1].tolist() # Get items where the value is 1
transactions.append(transaction)
# Create a DataFrame from the transactions
max_length = max(len(x) for x in transactions) # Find the max length of transactions
transactional_data = pd.DataFrame(transactions, columns=range(max_length)).fillna(np.nan)
repeated_item_sets_finder = ECLAT(data = transactional_data)
# Handle exceptions # Handle exceptions
try: try:
repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True) repeated_item_sets = None
if algorithm == 'apriori' or algorithm == 'fpgrowth':
repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
else:
_, repeated_item_sets = repeated_item_sets_finder.fit(min_support=min_support, separator=', ', verbose=False)
# Fix the result dictionary
# Initialize the new dictionary
fixed_dict = {'itemsets': [], 'support': []}
# Populate the new dictionary
for key, value in repeated_item_sets.items():
# Split the key into a list of items
itemset = key.split(', ')
# Append the itemset and its corresponding support value
fixed_dict['itemsets'].append(itemset)
fixed_dict['support'].append(value)
repeated_item_sets = pd.DataFrame(fixed_dict)
except Exception as e: except Exception as e:
raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}") raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment