Commit 8158976c authored by Almouhannad Hafez's avatar Almouhannad Hafez

Add helpers for finding rules

parent 6f099ed0
import pandas as pd import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
class HELPERS:
    """Static helpers for loading CSV data and mining association rules."""

    @staticmethod
    def read_dataset_from_csv(path: str) -> pd.DataFrame:
        """
        Read a dataset from a .csv file.

        Args:
            path (str): Path to .csv file to be read

        Returns:
            data (pd.DataFrame): DataFrame containing the data from the CSV
                file, or None if reading failed (an error message is printed
                instead of raising).
        """
        # Type checking
        assert isinstance(path, str), "path must be a string"
        # Handle exceptions: report the problem and fall through (returns None)
        try:
            data = pd.read_csv(path)
            return data
        except FileNotFoundError:
            print(f"Error: file at {path} was not found")
        except pd.errors.EmptyDataError:
            print(f"Error: file at {path} is empty")
        except pd.errors.ParserError:
            print(f"Error: file at {path} could not be parsed")
        except Exception as e:
            print(f"Unexpected error occurred: {e}")

    @staticmethod
    def find_repeated_item_sets(algorithm: str, data: pd.DataFrame, min_support: float) -> pd.DataFrame:
        """
        Find repeated item sets in the given data using the specified algorithm.

        Args:
            algorithm (str): Algorithm to use for finding repeated item sets.
                Must be either 'apriori' or 'fpgrowth'
            data (pd.DataFrame): Data in form of a pandas DataFrame
                (one-hot encoded transactions, as required by mlxtend)
            min_support (float): Minimum support threshold for the item sets

        Returns:
            repeated_item_sets (pd.DataFrame): DataFrame containing the
                repeated item sets found in the data

        Raises:
            RuntimeError: If the underlying mlxtend call fails.
        """
        # Type checking
        assert isinstance(algorithm, str), "Algorithm must be a string"
        assert algorithm in ["apriori", "fpgrowth"], "Algorithm must be either 'apriori' or 'fpgrowth'"
        assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
        assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number"
        # Select the mlxtend implementation matching the requested algorithm
        repeated_item_sets_finder = None
        if algorithm == "apriori":
            repeated_item_sets_finder = apriori
        elif algorithm == "fpgrowth":
            repeated_item_sets_finder = fpgrowth
        # Handle exceptions: wrap any mlxtend failure in a RuntimeError
        try:
            repeated_item_sets = repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
        except Exception as e:
            raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}")
        return repeated_item_sets

    @staticmethod
    def get_rules(repeated_item_sets: pd.DataFrame, min_confidence: float) -> pd.DataFrame:
        """
        Generate association rules from repeated item sets.

        Args:
            repeated_item_sets (pd.DataFrame): DataFrame containing repeated
                item sets (as produced by find_repeated_item_sets)
            min_confidence (float): Minimum confidence threshold for the rules

        Returns:
            rules (pd.DataFrame): DataFrame containing generated association rules

        Raises:
            RuntimeError: If the underlying mlxtend call fails.
        """
        # Type checking
        assert isinstance(repeated_item_sets, pd.DataFrame), "repeated_item_sets must be a pandas DataFrame"
        assert isinstance(min_confidence, (int, float)) and min_confidence > 0, "min_confidence must be a positive number"
        # Handle exceptions: wrap any mlxtend failure in a RuntimeError
        try:
            rules = association_rules(repeated_item_sets, metric='confidence', min_threshold=min_confidence)
        except Exception as e:
            raise RuntimeError(f"An error occurred while generating association rules: {str(e)}")
        return rules
...@@ -637,7 +637,8 @@ ...@@ -637,7 +637,8 @@
], ],
"source": [ "source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n", "min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_apriori = apriori(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using Apriori with min_support = {min_support}:\")\n", "print(f\"Repeated item sets using Apriori with min_support = {min_support}:\")\n",
"repeated_item_sets_apriori" "repeated_item_sets_apriori"
] ]
...@@ -787,7 +788,8 @@ ...@@ -787,7 +788,8 @@
], ],
"source": [ "source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n", "min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_apriori = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"print(f\"Association rules using Apriori with min_support = {min_support} and min_confidence = {min_confidence}:\")\n", "print(f\"Association rules using Apriori with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_apriori" "rules_apriori"
] ]
...@@ -956,7 +958,8 @@ ...@@ -956,7 +958,8 @@
], ],
"source": [ "source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n", "min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_fpg = fpgrowth(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using FP Growth with min_support = {min_support}:\")\n", "print(f\"Repeated item sets using FP Growth with min_support = {min_support}:\")\n",
"repeated_item_sets_fpg" "repeated_item_sets_fpg"
] ]
...@@ -1106,7 +1109,8 @@ ...@@ -1106,7 +1109,8 @@
], ],
"source": [ "source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n", "min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_fpg = association_rules(repeated_item_sets_fpg, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"print(f\"Association rules using FP Growth with min_support = {min_support} and min_confidence = {min_confidence}:\")\n", "print(f\"Association rules using FP Growth with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_fpg" "rules_fpg"
] ]
...@@ -1161,16 +1165,20 @@ ...@@ -1161,16 +1165,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Execution time for Apriori: 11.037009716033936 seconds\n" "Execution time for Apriori: 10.674063444137573 seconds\n"
] ]
} }
], ],
"source": [ "source": [
"start_time = time.time()\n", "start_time = time.time()\n",
"min_support = 0.0001\n", "min_support = 0.0001\n",
"repeated_item_sets_apriori = apriori(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n", "min_confidence = 0.0001\n",
"rules_apriori = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n", "end_time = time.time()\n",
"execution_time = end_time - start_time\n", "execution_time = end_time - start_time\n",
"print(f\"Execution time for Apriori: {execution_time} seconds\")" "print(f\"Execution time for Apriori: {execution_time} seconds\")"
...@@ -1192,16 +1200,20 @@ ...@@ -1192,16 +1200,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Execution time for FP Growth: 2.707993268966675 seconds\n" "Execution time for FP Growth: 2.7980523109436035 seconds\n"
] ]
} }
], ],
"source": [ "source": [
"start_time = time.time()\n", "start_time = time.time()\n",
"min_support = 0.0001\n", "min_support = 0.0001\n",
"repeated_item_sets_fpg = fpgrowth(df, min_support=min_support, use_colnames=True)\n", "repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n", "min_confidence = 0.0001\n",
"rules_fpg = association_rules(repeated_item_sets_fpg, metric=\"confidence\", min_threshold=min_confidence)\n", "rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n", "end_time = time.time()\n",
"execution_time = end_time - start_time\n", "execution_time = end_time - start_time\n",
"print(f\"Execution time for FP Growth: {execution_time} seconds\")" "print(f\"Execution time for FP Growth: {execution_time} seconds\")"
...@@ -1218,7 +1230,7 @@ ...@@ -1218,7 +1230,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 5 times faster!)***. \n", "> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 4 times faster!)***. \n",
"> **This is because `FP Growth` requires access the dataset multiple times to find repeated groups, when `Apriori` constructs the tree from the beginning and then don't access dataset again (working only with tree)**" "> **This is because `FP Growth` requires access the dataset multiple times to find repeated groups, when `Apriori` constructs the tree from the beginning and then don't access dataset again (working only with tree)**"
] ]
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment