Commit 8158976c authored by Almouhannad Hafez's avatar Almouhannad Hafez

Add helpers for finding rules

parent 6f099ed0
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
class HELPERS:
    """Static helper routines for association-rule mining with mlxtend."""

    @staticmethod
    def read_dataset_from_csv(path: str) -> pd.DataFrame:
        """
        Read a dataset from a .csv file.

        Args:
            path (str): Path to .csv file to be read

        Returns:
            data (pd.DataFrame): Pandas DataFrame containing the data from the
            CSV file, or None when the file is missing, empty, or unparsable
            (an error message is printed in those cases).
        """
        # Type checking (NOTE: asserts are stripped under `python -O`)
        assert isinstance(path, str), "path must be a string"
        # Handle exceptions: best-effort read — report the problem and
        # return None instead of propagating the error to the caller.
        try:
            return pd.read_csv(path)
        except FileNotFoundError:
            print(f"Error: file at {path} was not found")
        except pd.errors.EmptyDataError:
            print(f"Error: file at {path} is empty")
        except pd.errors.ParserError:
            print(f"Error: file at {path} could not be parsed")
        except Exception as e:
            print(f"Unexpected error occurred: {e}")
        return None  # explicit: all failure paths yield None

    @staticmethod
    def find_repeated_item_sets(algorithm: str, data: pd.DataFrame, min_support: float) -> pd.DataFrame:
        """
        Find repeated item sets in the given data using the specified algorithm.

        Args:
            algorithm (str): Algorithm to use for finding repeated item sets.
                Must be either 'apriori' or 'fpgrowth'
            data (pd.DataFrame): Data in form of a pandas DataFrame
                (one-hot encoded transactions, as mlxtend expects)
            min_support (float): minimum support threshold for the item sets.

        Returns:
            repeated_item_sets (pd.DataFrame): DataFrame containing the
            repeated item sets found in the data

        Raises:
            RuntimeError: if the underlying mlxtend call fails.
        """
        # Type checking (NOTE: asserts are stripped under `python -O`)
        assert isinstance(algorithm, str), "Algorithm must be a string"
        assert algorithm in ["apriori", "fpgrowth"], "Algorithm must be either 'apriori' or 'fpgrowth'"
        assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
        assert isinstance(min_support, (int, float)) and min_support > 0, "min_support must be a positive number"
        # Dispatch table instead of if/elif; keys mirror the assert above.
        finders = {"apriori": apriori, "fpgrowth": fpgrowth}
        repeated_item_sets_finder = finders[algorithm]
        # Handle exceptions; chain the cause so the original traceback survives.
        try:
            return repeated_item_sets_finder(data, min_support=min_support, use_colnames=True)
        except Exception as e:
            raise RuntimeError(f"An error occurred while finding repeated item sets: {str(e)}") from e

    @staticmethod
    def get_rules(repeated_item_sets: pd.DataFrame, min_confidence: float) -> pd.DataFrame:
        """
        Generate association rules from repeated item sets.

        Args:
            repeated_item_sets (pd.DataFrame): DataFrame containing repeated item sets
            min_confidence (float): Minimum confidence threshold for the rules.

        Returns:
            rules (pd.DataFrame): DataFrame containing generated association rules

        Raises:
            RuntimeError: if the underlying mlxtend call fails.
        """
        # Type checking (NOTE: asserts are stripped under `python -O`)
        assert isinstance(repeated_item_sets, pd.DataFrame), "repeated_item_sets must be a pandas DataFrame"
        assert isinstance(min_confidence, (int, float)) and min_confidence > 0, "min_confidence must be a positive number"
        # Handle exceptions; chain the cause so the original traceback survives.
        try:
            rules = association_rules(repeated_item_sets, metric='confidence', min_threshold=min_confidence)
        except Exception as e:
            raise RuntimeError(f"An error occurred while generating association rules: {str(e)}") from e
        return rules
\ No newline at end of file
......@@ -637,7 +637,8 @@
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_apriori = apriori(df, min_support=min_support, use_colnames=True)\n",
"repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using Apriori with min_support = {min_support}:\")\n",
"repeated_item_sets_apriori"
]
......@@ -787,7 +788,8 @@
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_apriori = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n",
"rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"print(f\"Association rules using Apriori with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_apriori"
]
......@@ -956,7 +958,8 @@
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_fpg = fpgrowth(df, min_support=min_support, use_colnames=True)\n",
"repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using FP Growth with min_support = {min_support}:\")\n",
"repeated_item_sets_fpg"
]
......@@ -1106,7 +1109,8 @@
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_fpg = association_rules(repeated_item_sets_fpg, metric=\"confidence\", min_threshold=min_confidence)\n",
"rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"print(f\"Association rules using FP Growth with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_fpg"
]
......@@ -1161,16 +1165,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Execution time for Apriori: 11.037009716033936 seconds\n"
"Execution time for Apriori: 10.674063444137573 seconds\n"
]
}
],
"source": [
"start_time = time.time()\n",
"min_support = 0.0001\n",
"repeated_item_sets_apriori = apriori(df, min_support=min_support, use_colnames=True)\n",
"repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n",
"rules_apriori = association_rules(repeated_item_sets_apriori, metric=\"confidence\", min_threshold=min_confidence)\n",
"rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"print(f\"Execution time for Apriori: {execution_time} seconds\")"
......@@ -1192,16 +1200,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Execution time for FP Growth: 2.707993268966675 seconds\n"
"Execution time for FP Growth: 2.7980523109436035 seconds\n"
]
}
],
"source": [
"start_time = time.time()\n",
"min_support = 0.0001\n",
"repeated_item_sets_fpg = fpgrowth(df, min_support=min_support, use_colnames=True)\n",
"repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n",
"rules_fpg = association_rules(repeated_item_sets_fpg, metric=\"confidence\", min_threshold=min_confidence)\n",
"rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"print(f\"Execution time for FP Growth: {execution_time} seconds\")"
......@@ -1218,7 +1230,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 5 times faster!)***. \n",
"> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 4 times faster!)***. \n",
"> **This is because `Apriori` requires accessing the dataset multiple times to find repeated groups, while `FP Growth` constructs the tree from the beginning and then doesn't access the dataset again (working only with the tree)**"
]
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment