Add Eclat

c8a34262 · Almouhannad Hafez · df109ff0 · c8a34262 · c8a34262
Commit c8a34262 authored Nov 12, 2024 by Almouhannad Hafez
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 430 additions and 5 deletions

2.2.association_rules_eclat.ipynb 2.2.association_rules_eclat.ipynb +397 -0

helpers.py helpers.py +33 -5

No files found.
--- a/2.2.association_rules_eclat.ipynb
+++ b/2.2.association_rules_eclat.ipynb
--- a/helpers.py
+++ b/helpers.py
 import pandas as pd
-from mlxtend.frequent_patterns import apriori, association_rules
+from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
-from mlxtend.frequent_patterns import fpgrowth
+from pyECLAT import ECLAT
+import numpy as np
 class HELPERS:
    def read_dataset_from_csv(path: str) -> pd.DataFrame:
@@ -38,7 +39,7 @@ class HELPERS:
        Args:
            algorithm (str): Algorithm to use for finding repeated item sets. Must be one of 'apriori', 'fpgrowth' or 'eclat'
-            data (pd.DataFrame): Data in form of a pandas DataFrame
+            data (pd.DataFrame): Data in form of a pandas DataFrame (one-hot encoded)
            min_support (float): minimum support threshold for the item sets.
        Returns:
@@ -47,7 +48,7 @@ class HELPERS:
        # Type checking
        assert isinstance(algorithm, str), "Algorithm must be a string"
-        assert algorithm in ["apriori", "fpgrowth"], "Algorithm must be either 'apriori' or 'fpgrowth'"
+        assert algorithm in ["apriori", "fpgrowth", "eclat"], "Algorithm must be either 'apriori' or 'fpgrowth' or 'eclat'"
        assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
        assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number"
@@ -57,10 +58,37 @@ class HELPERS:
            repeated_item_sets_finder = apriori
        elif algorithm == "fpgrowth":
            repeated_item_sets_finder = fpgrowth
+        elif algorithm == "eclat":
+            # Convert one-hot encoded DataFrame to transaction format to be able to use with Eclat module
+            transactions = []
+            for _, row in data.iterrows():
+                transaction = row.index[row == 1].tolist()  # Get items where the value is 1
+                transactions.append(transaction)
+            # Create a DataFrame from the transactions
+            max_length = max(len(x) for x in transactions)  # Find the max length of transactions
+            transactional_data = pd.DataFrame(transactions, columns=range(max_length)).fillna(np.nan)
+            repeated_item_sets_finder = ECLAT(data = transactional_data)            
        # Handle exceptions
        try:
-            repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
+            repeated_item_sets = None
+            if algorithm == 'apriori' or algorithm == 'fpgrowth':
+                repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
+            else:
+                _, repeated_item_sets = repeated_item_sets_finder.fit(min_support=min_support, separator=', ', verbose=False)
+                # Fix the result dictionary
+                # Initialize the new dictionary
+                fixed_dict = {'itemsets': [], 'support': []}
+                # Populate the new dictionary
+                for key, value in repeated_item_sets.items():
+                    # Split the key into a list of items
+                    itemset = key.split(', ')
+                    # Append the itemset and its corresponding support value
+                    fixed_dict['itemsets'].append(itemset)
+                    fixed_dict['support'].append(value)
+                repeated_item_sets = pd.DataFrame(fixed_dict)
        except Exception as e:
            raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}")