Commit 8158976c authored by Almouhannad Hafez's avatar Almouhannad Hafez

Add helpers for finding rules

parent 6f099ed0
import pandas as pd import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
class HELPERS:
    """Static helpers for loading CSV data and mining association rules."""

    @staticmethod
    def read_dataset_from_csv(path: str) -> pd.DataFrame:
        """
        Read a dataset from a .csv file.

        Args:
            path (str): Path to .csv file to be read

        Returns:
            data (pd.DataFrame): DataFrame containing the data from the CSV
                file, or None if reading failed (an error message is printed
                instead of raising).
        """
        # Type checking
        assert isinstance(path, str), "path must be a string"
        # Handle exceptions: report the problem and fall through (returns None)
        try:
            data = pd.read_csv(path)
            return data
        except FileNotFoundError:
            print(f"Error: file at {path} was not found")
        except pd.errors.EmptyDataError:
            print(f"Error: file at {path} is empty")
        except pd.errors.ParserError:
            print(f"Error: file at {path} could not be parsed")
        except Exception as e:
            print(f"Unexpected error occurred: {e}")

    @staticmethod
    def find_repeated_item_sets(algorithm: str, data: pd.DataFrame, min_support: float) -> pd.DataFrame:
        """
        Find repeated item sets in the given data using the specified algorithm.

        Args:
            algorithm (str): Algorithm to use for finding repeated item sets.
                Must be either 'apriori' or 'fpgrowth'
            data (pd.DataFrame): Data in form of a pandas DataFrame
                (one-hot encoded transactions, as required by mlxtend)
            min_support (float): Minimum support threshold for the item sets

        Returns:
            repeated_item_sets (pd.DataFrame): DataFrame containing the
                repeated item sets found in the data

        Raises:
            RuntimeError: If the underlying mlxtend call fails.
        """
        # Type checking
        assert isinstance(algorithm, str), "Algorithm must be a string"
        assert algorithm in ["apriori", "fpgrowth"], "Algorithm must be either 'apriori' or 'fpgrowth'"
        assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
        assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number"
        # Select the mlxtend implementation matching the requested algorithm
        repeated_item_sets_finder = None
        if algorithm == "apriori":
            repeated_item_sets_finder = apriori
        elif algorithm == "fpgrowth":
            repeated_item_sets_finder = fpgrowth
        # Handle exceptions: wrap any mlxtend failure in a RuntimeError
        try:
            repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
        except Exception as e:
            raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}")
        return repeated_item_sets

    @staticmethod
    def get_rules(repeated_item_sets: pd.DataFrame, min_confidence: float) -> pd.DataFrame:
        """
        Generate association rules from repeated item sets.

        Args:
            repeated_item_sets (pd.DataFrame): DataFrame containing repeated
                item sets (as produced by find_repeated_item_sets)
            min_confidence (float): Minimum confidence threshold for the rules

        Returns:
            rules (pd.DataFrame): DataFrame containing generated association rules

        Raises:
            RuntimeError: If the underlying mlxtend call fails.
        """
        # Type checking
        assert isinstance(repeated_item_sets, pd.DataFrame), "repeated_item_sets must be a pandas DataFrame"
        assert isinstance(min_confidence, (int, float)) and min_confidence > 0, "min_confidence must be a positive number"
        # Handle exceptions: wrap any mlxtend failure in a RuntimeError
        try:
            rules = association_rules(repeated_item_sets, metric='confidence', min_threshold=min_confidence)
        except Exception as e:
            raise RuntimeError(f"An error occurred while generating association rules: {str(e)}")
        return rules
...@@ -637,7 +637,8 @@ ...@@ -637,7 +637,8 @@
], ],
"source": [ "source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n", "min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_apriori = apriori(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using Apriori with min_support = {min_support}:\")\n", "print(f\"Repeated item sets using Apriori with min_support = {min_support}:\")\n",
"repeated_item_sets_apriori" "repeated_item_sets_apriori"
] ]
...@@ -787,7 +788,8 @@ ...@@ -787,7 +788,8 @@
], ],
"source": [ "source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n", "min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_apriori = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"print(f\"Association rules using Apriori with min_support = {min_support} and min_confidence = {min_confidence}:\")\n", "print(f\"Association rules using Apriori with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_apriori" "rules_apriori"
] ]
...@@ -956,7 +958,8 @@ ...@@ -956,7 +958,8 @@
], ],
"source": [ "source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n", "min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_fpg = fpgrowth(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using FP Growth with min_support = {min_support}:\")\n", "print(f\"Repeated item sets using FP Growth with min_support = {min_support}:\")\n",
"repeated_item_sets_fpg" "repeated_item_sets_fpg"
] ]
...@@ -1106,7 +1109,8 @@ ...@@ -1106,7 +1109,8 @@
], ],
"source": [ "source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n", "min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_fpg = association_rules(repeated_item_sets_fpg, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"print(f\"Association rules using FP Growth with min_support = {min_support} and min_confidence = {min_confidence}:\")\n", "print(f\"Association rules using FP Growth with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_fpg" "rules_fpg"
] ]
...@@ -1161,16 +1165,20 @@ ...@@ -1161,16 +1165,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Execution time for Apriori: 11.037009716033936 seconds\n" "Execution time for Apriori: 10.674063444137573 seconds\n"
] ]
} }
], ],
"source": [ "source": [
"start_time = time.time()\n", "start_time = time.time()\n",
"min_support = 0.0001\n", "min_support = 0.0001\n",
"repeated_item_sets_apriori = apriori(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n", "min_confidence = 0.0001\n",
"rules_apriori = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n", "end_time = time.time()\n",
"execution_time = end_time - start_time\n", "execution_time = end_time - start_time\n",
"print(f\"Execution time for Apriori: {execution_time} seconds\")" "print(f\"Execution time for Apriori: {execution_time} seconds\")"
...@@ -1192,16 +1200,20 @@ ...@@ -1192,16 +1200,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Execution time for FP Growth: 2.707993268966675 seconds\n" "Execution time for FP Growth: 2.7980523109436035 seconds\n"
] ]
} }
], ],
"source": [ "source": [
"start_time = time.time()\n", "start_time = time.time()\n",
"min_support = 0.0001\n", "min_support = 0.0001\n",
"repeated_item_sets_fpg = fpgrowth(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n", "min_confidence = 0.0001\n",
"rules_fpg = association_rules(repeated_item_sets_fpg, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n", "end_time = time.time()\n",
"execution_time = end_time - start_time\n", "execution_time = end_time - start_time\n",
"print(f\"Execution time for FP Growth: {execution_time} seconds\")" "print(f\"Execution time for FP Growth: {execution_time} seconds\")"
...@@ -1218,7 +1230,7 @@ ...@@ -1218,7 +1230,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 5 times faster!)***. \n", "> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 4 times faster!)***. \n",
"> **This is because `FP Growth` requires access the dataset multiple times to find repeated groups, when `Apriori` constructs the tree from the beginning and then don't access dataset again (working only with tree)**" "> **This is because `FP Growth` requires access the dataset multiple times to find repeated groups, when `Apriori` constructs the tree from the beginning and then don't access dataset again (working only with tree)**"
] ]
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment