Commit c8a34262 authored by Almouhannad Hafez's avatar Almouhannad Hafez

Add Eclat

parent df109ff0
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Contents***\n",
"- **[Setup](#0.-Setup)**\n",
"- **[Load dataset](#1.-Load-dataset)**\n",
"- **[Get repeated item sets](#2.-Get-repeated-item-sets)**\n",
"- **[Get rules](#3.-Get-rules)**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***0. Setup***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Please note that the following cell may require a working VPN connection to run**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# %pip install pyECLAT\n",
"# %pip install numpy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from helpers import HELPERS\n",
"from constants import CONSTANTS\n",
"# Some more magic so that the notebook will reload external python modules;\n",
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%reload_ext autoreload"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***1. Load dataset***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully with shape: (9465, 94)\n"
]
}
],
"source": [
"df = None\n",
"df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll deal only with the first 15 transactions**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = df.head(15)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***2. Get repeated item sets***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repeated item sets using Eclat with min_support = 0.2:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>itemsets</th>\n",
" <th>support</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[Coffee]</td>\n",
" <td>0.266667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[Medialuna]</td>\n",
" <td>0.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[Bread]</td>\n",
" <td>0.466667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[Muffin]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[Scandinavian]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[Pastry]</td>\n",
" <td>0.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>[Coffee, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[Medialuna, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>[Bread, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" itemsets support\n",
"0 [Coffee] 0.266667\n",
"1 [Medialuna] 0.333333\n",
"2 [Bread] 0.466667\n",
"3 [Muffin] 0.200000\n",
"4 [Scandinavian] 0.200000\n",
"5 [Pastry] 0.400000\n",
"6 [Coffee, Pastry] 0.200000\n",
"7 [Medialuna, Pastry] 0.200000\n",
"8 [Bread, Pastry] 0.200000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_eclat = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'eclat',\n",
" data = df,\n",
" min_support = min_support)\n",
"print(f\"Repeated item sets using Eclat with min_support = {min_support}:\")\n",
"repeated_item_sets_eclat"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***3. Get rules***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Association rules using Eclat with min_support = 0.2 and min_confidence = 0.5:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" <th>zhangs_metric</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.75</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>2.400000</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n",
" <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>1.466667</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.60</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.333333</td>\n",
" <td>0.555556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Bread)</td>\n",
" <td>0.400000</td>\n",
" <td>0.466667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.071429</td>\n",
" <td>0.013333</td>\n",
" <td>1.066667</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support \\\n",
"0 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"1 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"2 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"3 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"4 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"\n",
" confidence lift leverage conviction zhangs_metric \n",
"0 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"1 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"2 0.60 1.500000 0.066667 1.500000 0.500000 \n",
"3 0.50 1.500000 0.066667 1.333333 0.555556 \n",
"4 0.50 1.071429 0.013333 1.066667 0.111111 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_eclat = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_eclat, \n",
" min_confidence = min_confidence\n",
" )\n",
"print(f\"Association rules using Eclat with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_eclat"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ML",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from pyECLAT import ECLAT
import numpy as np
class HELPERS:
def read_dataset_from_csv(path: str) -> pd.DataFrame:
......@@ -38,7 +39,7 @@ class HELPERS:
Args:
algorithm (str): Algorithm to use for finding repeated item sets. Must be either 'apriori' or 'fpgrowth'
data (pd.DataFrame): Data in form of a pandas DataFrame
data (pd.DataFrame): Data in form of a pandas DataFrame (one-hot encoded)
min_support (float): minimum support threshold for the item sets.
Returns:
......@@ -47,7 +48,7 @@ class HELPERS:
# Type checking
assert isinstance(algorithm, str), "Algorithm must be a string"
assert algorithm in ["apriori", "fpgrowth"], "Algorithm must be either 'apriori' or 'fpgrowth'"
assert algorithm in ["apriori", "fpgrowth", "eclat"], "Algorithm must be either 'apriori' or 'fpgrowth' or 'eclat'"
assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number"
......@@ -57,10 +58,37 @@ class HELPERS:
repeated_item_sets_finder = apriori
elif algorithm == "fpgrowth":
repeated_item_sets_finder = fpgrowth
elif algorithm == "eclat":
# Convert the one-hot encoded DataFrame to transaction format so it can be used with the Eclat module
transactions = []
for _, row in data.iterrows():
transaction = row.index[row == 1].tolist() # Get items where the value is 1
transactions.append(transaction)
# Create a DataFrame from the transactions
max_length = max(len(x) for x in transactions) # Find the max length of transactions
transactional_data = pd.DataFrame(transactions, columns=range(max_length)).fillna(np.nan)
repeated_item_sets_finder = ECLAT(data = transactional_data)
# Handle exceptions
try:
repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
repeated_item_sets = None
if algorithm == 'apriori' or algorithm == 'fpgrowth':
repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
else:
_, repeated_item_sets = repeated_item_sets_finder.fit(min_support=min_support, separator=', ', verbose=False)
# Fix the result dictionary
# Initialize the new dictionary
fixed_dict = {'itemsets': [], 'support': []}
# Populate the new dictionary
for key, value in repeated_item_sets.items():
# Split the key into a list of items
itemset = key.split(', ')
# Append the itemset and its corresponding support value
fixed_dict['itemsets'].append(itemset)
fixed_dict['support'].append(value)
repeated_item_sets = pd.DataFrame(fixed_dict)
except Exception as e:
raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment