Commit c8a34262 authored by Almouhannad Hafez's avatar Almouhannad Hafez

Add Eclat

parent df109ff0
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Contents***\n",
"- **[Setup](#0.-Setup)**\n",
"- **[Load dataset](#1.-Load-dataset)**\n",
"- **[Get repeated item sets](#2.-Get-repeated-item-sets)**\n",
"- **[Get rules](#3.-Get-rules)**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***0. Setup***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Please note that the following cell may require a working VPN connection to run**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# %pip install pyECLAT\n",
"# %pip install numpy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from helpers import HELPERS\n",
"from constants import CONSTANTS\n",
"# Some more magic so that the notebook will reload external python modules;\n",
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%reload_ext autoreload"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***1. Load dataset***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully with shape: (9465, 94)\n"
]
}
],
"source": [
"df = None\n",
"df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll deal only with the first 15 transactions**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = df.head(15)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***2. Get repeated item sets***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repeated item sets using Eclat with min_support = 0.2:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>itemsets</th>\n",
" <th>support</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[Coffee]</td>\n",
" <td>0.266667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[Medialuna]</td>\n",
" <td>0.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[Bread]</td>\n",
" <td>0.466667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[Muffin]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[Scandinavian]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[Pastry]</td>\n",
" <td>0.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>[Coffee, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[Medialuna, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>[Bread, Pastry]</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" itemsets support\n",
"0 [Coffee] 0.266667\n",
"1 [Medialuna] 0.333333\n",
"2 [Bread] 0.466667\n",
"3 [Muffin] 0.200000\n",
"4 [Scandinavian] 0.200000\n",
"5 [Pastry] 0.400000\n",
"6 [Coffee, Pastry] 0.200000\n",
"7 [Medialuna, Pastry] 0.200000\n",
"8 [Bread, Pastry] 0.200000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_eclat = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'eclat',\n",
" data = df,\n",
" min_support = min_support)\n",
"print(f\"Repeated item sets using Eclat with min_support = {min_support}:\")\n",
"repeated_item_sets_eclat"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***3. Get rules***\n",
"[Back to contents](#Contents)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Association rules using Eclat with min_support = 0.2 and min_confidence = 0.5:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" <th>zhangs_metric</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.75</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>2.400000</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n",
" <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>1.466667</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.60</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.333333</td>\n",
" <td>0.555556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Bread)</td>\n",
" <td>0.400000</td>\n",
" <td>0.466667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.071429</td>\n",
" <td>0.013333</td>\n",
" <td>1.066667</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support \\\n",
"0 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"1 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"2 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"3 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"4 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"\n",
" confidence lift leverage conviction zhangs_metric \n",
"0 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"1 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"2 0.60 1.500000 0.066667 1.500000 0.500000 \n",
"3 0.50 1.500000 0.066667 1.333333 0.555556 \n",
"4 0.50 1.071429 0.013333 1.066667 0.111111 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_eclat = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_eclat, \n",
" min_confidence = min_confidence\n",
" )\n",
"print(f\"Association rules using Eclat with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_eclat"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ML",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from pyECLAT import ECLAT
import numpy as np
class HELPERS:
def read_dataset_from_csv(path: str) -> pd.DataFrame:
......@@ -38,7 +39,7 @@ class HELPERS:
Args:
algorithm (str): Algorithm to use for finding repeated item sets. Must be either 'apriori' or 'fpgrowth'
data (pd.DataFrame): Data in form of a pandas DataFrame
data (pd.DataFrame): Data in form of a pandas DataFrame (one-hot encoded)
min_support (float): minimum support threshold for the item sets.
Returns:
......@@ -47,7 +48,7 @@ class HELPERS:
# Type checking
assert isinstance(algorithm, str), "Algorithm must be a string"
assert algorithm in ["apriori", "fpgrowth"], "Algorithm must be either 'apriori' or 'fpgrowth'"
assert algorithm in ["apriori", "fpgrowth", "eclat"], "Algorithm must be either 'apriori' or 'fpgrowth' or 'eclat'"
assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number"
......@@ -57,10 +58,37 @@ class HELPERS:
repeated_item_sets_finder = apriori
elif algorithm == "fpgrowth":
repeated_item_sets_finder = fpgrowth
elif algorithm == "eclat":
# Convert the one-hot encoded DataFrame to transaction format so it can be used with the Eclat module
transactions = []
for _, row in data.iterrows():
transaction = row.index[row == 1].tolist() # Get items where the value is 1
transactions.append(transaction)
# Create a DataFrame from the transactions
max_length = max(len(x) for x in transactions) # Find the max length of transactions
transactional_data = pd.DataFrame(transactions, columns=range(max_length)).fillna(np.nan)
repeated_item_sets_finder = ECLAT(data = transactional_data)
# Handle exceptions
try:
repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
repeated_item_sets = None
if algorithm == 'apriori' or algorithm == 'fpgrowth':
repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
else:
_, repeated_item_sets = repeated_item_sets_finder.fit(min_support=min_support, separator=', ', verbose=False)
# Fix the result dictionary
# Initialize the new dictionary
fixed_dict = {'itemsets': [], 'support': []}
# Populate the new dictionary
for key, value in repeated_item_sets.items():
# Split the key into a list of items
itemset = key.split(', ')
# Append the itemset and its corresponding support value
fixed_dict['itemsets'].append(itemset)
fixed_dict['support'].append(value)
repeated_item_sets = pd.DataFrame(fixed_dict)
except Exception as e:
raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment