Commit df109ff0 authored by Almouhannad Hafez

Remove hw1.ipynb

parent 5a890e96
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***Contents:***\n",
"- **0. Setup**\n",
"- **1. Data-preprocessing**\n",
"- **2. Extracting rules using apriori**\n",
"- **3. Extracting rules using FP Growth**\n",
"- **4. Performance comparison**\n",
"\n",
"> You can navigate through contents using `outline` in your jupyter editor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***0. Setup***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Please note that the following cell may require working VPN to work**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%pip install pandas\n",
"%pip install mlxtend\n",
"%pip install TIME-python"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from mlxtend.frequent_patterns import apriori, association_rules\n",
"from mlxtend.frequent_patterns import fpgrowth\n",
"\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from helpers import HELPERS\n",
"from constants import CONSTANTS\n",
"# Some more magic so that the notebook will reload external python modules;\n",
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%reload_ext autoreload"
]
},
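{
"cell_type": "markdown",
"metadata": {},
"source": [
"> ***Note:*** `helpers.py` and `constants.py` are local modules that are not shown in this notebook. Given the `mlxtend` imports above, `HELPERS.find_repeated_item_sets` and `HELPERS.get_rules` are presumably thin wrappers around `apriori`/`fpgrowth` and `association_rules`; the next cell is only a hedged sketch of what such wrappers could look like, not the actual helper code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: the real helpers.py is not part of this notebook.\n",
"# It plausibly dispatches to mlxtend's apriori/fpgrowth and wraps\n",
"# association_rules; the `sketch_` names below are hypothetical.\n",
"def sketch_find_repeated_item_sets(algorithm, data, min_support):\n",
"    if algorithm == 'apriori':\n",
"        return apriori(data, min_support=min_support, use_colnames=True)\n",
"    if algorithm == 'fpgrowth':\n",
"        return fpgrowth(data, min_support=min_support, use_colnames=True)\n",
"    raise ValueError(f\"Unknown algorithm: {algorithm}\")\n",
"\n",
"def sketch_get_rules(repeated_item_sets, min_confidence):\n",
"    # association_rules keeps rules whose confidence >= min_confidence\n",
"    return association_rules(repeated_item_sets, metric='confidence',\n",
"                             min_threshold=min_confidence)"
]
},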
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***1. Data preprocessing***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1.1. Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully with shape: (20507, 5)\n"
]
}
],
"source": [
"df = None\n",
"df = HELPERS.read_dataset_from_csv(CONSTANTS.DATASET_PATH)\n",
"assert df.shape == CONSTANTS.DATASET_SHAPE, f\"Expected shape {CONSTANTS.DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1.2. Check null values***"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Null values in each column:\n",
"Transaction 0\n",
"Item 0\n",
"date_time 0\n",
"period_day 0\n",
"weekday_weekend 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(\"Null values in each column:\")\n",
"print(df.isnull().sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Nothing to do since there is no null values**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1.3. Check duplicates***"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of duplicates in dataset: 1620\n"
]
}
],
"source": [
"print(f\"Number of duplicates in dataset: {df.duplicated().sum()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We have 1620 duplicated rows, let's remove them**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of duplicates in dataset: 0\n",
"New dataset shape: (18887, 5)\n"
]
}
],
"source": [
"df = df.drop_duplicates()\n",
"print(f\"Number of duplicates in dataset: {df.duplicated().sum()}\")\n",
"print(f\"New dataset shape: {df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Now, let's count number of unique items, and total transactions in the dataset**"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of transactions in the dataset: 9465\n",
"Number of unique items in the dataset: 94\n"
]
}
],
"source": [
"print(f\"Number of transactions in the dataset: {df['Transaction'].nunique()}\")\n",
"print(f\"Number of unique items in the dataset: {df['Item'].nunique()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1.4. Process dataset columns***"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset columns:\n",
"Transaction int64\n",
"Item object\n",
"date_time object\n",
"period_day object\n",
"weekday_weekend object\n",
"dtype: object\n"
]
}
],
"source": [
"print(f\"Dataset columns:\")\n",
"print(df.dtypes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We have 5 columns**\n",
"1. **`Transaction`**: Transaction id\n",
"1. **`Item`**: Item name\n",
"1. **`date_time`**: Date of transaction\n",
"1. **`period_day`**: In which period of day (morning, afternoon, ...) the transaction is\n",
"1. **`weekday_weeken`**: In weekday or weekend the transaction is\n",
"\n",
"***Please note:*** **If a transaction contains multiple items, each one will be represented in a seperate row with same id**\n",
"\n",
"**We are inrested only in `Transaction` and `Item`, so we'll delete other columns and rename them**"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset new columns:\n",
"transaction_id int64\n",
"item_name object\n",
"dtype: object\n"
]
}
],
"source": [
"df = df.loc[:, ['Transaction', 'Item']].rename(columns={\n",
" 'Transaction': 'transaction_id',\n",
" 'Item': 'item_name'\n",
"})\n",
"\n",
"print(f\"Dataset new columns:\")\n",
"print(df.dtypes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***1.5. Convert to one-hot-encoding***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll convert the dataset into one-hot-encoding as following**: \n",
"> - Each row contain 94 features (columns) + 1 feature for transaction_id\n",
"> - 94 is number of unique items\n",
"> - Each feature value is boolean (true meaning that item is in the transaction and vice versa)\n",
"> - So, the new shape of dataset will be (9465, 95)\n",
"> - We're doing so to be able to use libraries for applying Apriori and FP Growth"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset before one-hot-encoding:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>transaction_id</th>\n",
" <th>item_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Bread</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Scandinavian</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Hot chocolate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>Jam</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>3</td>\n",
" <td>Cookies</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" transaction_id item_name\n",
"0 1 Bread\n",
"1 2 Scandinavian\n",
"3 3 Hot chocolate\n",
"4 3 Jam\n",
"5 3 Cookies"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(f\"Dataset before one-hot-encoding:\")\n",
"df.head()"
]
},
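{
"cell_type": "markdown",
"metadata": {},
"source": [
"> ***Aside (illustrative sketch, not part of the pipeline):*** the same transaction-by-item table can be built in a single step with `pd.crosstab`, which counts item occurrences per transaction; comparing with `> 0` turns the counts into booleans directly. The `one_hot_sketch` name below is ours, for illustration only; the next cell keeps the original `get_dummies` + `groupby` approach."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative alternative: crosstab counts items per transaction,\n",
"# and '> 0' converts those counts to booleans in one step.\n",
"one_hot_sketch = pd.crosstab(df['transaction_id'], df['item_name']) > 0\n",
"print(f\"Sketch shape (transactions x items): {one_hot_sketch.shape}\")"
]
},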
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New dataset shape: (9465, 95)\n"
]
}
],
"source": [
"one_hot_encoded = pd.get_dummies(df['item_name'])\n",
"df = df[['transaction_id']].join(one_hot_encoded).groupby('transaction_id').sum()\n",
"df.reset_index(inplace=True)\n",
"print(f\"New dataset shape: {df.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Let's delete `transaction_id` column sice it's not required, and convert other columns into boolean to save space**: "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final dataset shape: (9465, 94)\n",
"Dataset columns types after one-hot-encoding:\n",
"Adjustment bool\n",
"Afternoon with the baker bool\n",
"Alfajores bool\n",
"Argentina Night bool\n",
"Art Tray bool\n",
" ... \n",
"Tshirt bool\n",
"Valentine's card bool\n",
"Vegan Feast bool\n",
"Vegan mincepie bool\n",
"Victorian Sponge bool\n",
"Length: 94, dtype: object\n"
]
}
],
"source": [
"df = df.drop(columns=['transaction_id'])\n",
"df = df.astype(bool)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(f\"Final dataset shape: {df.shape}\")\n",
"print(f\"Dataset columns types after one-hot-encoding:\")\n",
"print(df.dtypes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Let's save preprocessed dataset in a .csv file**"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(CONSTANTS.PREPROCESSED_DATASET_PATH, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***2. Extracting rules using Apriori***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2.1. Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully with shape: (9465, 94)\n"
]
}
],
"source": [
"df = None\n",
"df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll deal only with first 15 transactions**"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df = df.head(15)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2.2. Get repeated item sets***"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repeated item sets using Apriori with min_support = 0.2:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>support</th>\n",
" <th>itemsets</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.466667</td>\n",
" <td>(Bread)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.266667</td>\n",
" <td>(Coffee)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.333333</td>\n",
" <td>(Medialuna)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.200000</td>\n",
" <td>(Muffin)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.400000</td>\n",
" <td>(Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.200000</td>\n",
" <td>(Scandinavian)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.200000</td>\n",
" <td>(Bread, Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.200000</td>\n",
" <td>(Coffee, Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.200000</td>\n",
" <td>(Medialuna, Pastry)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" support itemsets\n",
"0 0.466667 (Bread)\n",
"1 0.266667 (Coffee)\n",
"2 0.333333 (Medialuna)\n",
"3 0.200000 (Muffin)\n",
"4 0.400000 (Pastry)\n",
"5 0.200000 (Scandinavian)\n",
"6 0.200000 (Bread, Pastry)\n",
"7 0.200000 (Coffee, Pastry)\n",
"8 0.200000 (Medialuna, Pastry)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using Apriori with min_support = {min_support}:\")\n",
"repeated_item_sets_apriori"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***2.3. Get rules***"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Association rules using Apriori with min_support = 0.2 and min_confidence = 0.5:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" <th>zhangs_metric</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Bread)</td>\n",
" <td>0.400000</td>\n",
" <td>0.466667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.071429</td>\n",
" <td>0.013333</td>\n",
" <td>1.066667</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.75</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>2.400000</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n",
" <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>1.466667</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.60</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.333333</td>\n",
" <td>0.555556</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support \\\n",
"0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"1 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"2 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"3 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"4 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"\n",
" confidence lift leverage conviction zhangs_metric \n",
"0 0.50 1.071429 0.013333 1.066667 0.111111 \n",
"1 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"2 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"3 0.60 1.500000 0.066667 1.500000 0.500000 \n",
"4 0.50 1.500000 0.066667 1.333333 0.555556 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"print(f\"Association rules using Apriori with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_apriori"
]
},
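{
"cell_type": "markdown",
"metadata": {},
"source": [
"> ***Reading the table (a quick sanity check):*** for a rule A → B, confidence = support(A ∪ B) / support(A), and lift = confidence / support(B). For example, for (Coffee) → (Pastry): confidence = 0.2 / 0.266667 = 0.75 and lift = 0.75 / 0.4 = 1.875, matching row 1 above."
]
},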
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***3. Extracting rules using FP Growth***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3.1. Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully with shape: (9465, 94)\n"
]
}
],
"source": [
"df = None\n",
"df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We'll deal only with first 15 transactions**"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df = df.head(15)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3.2. Get repeated item sets***\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Repeated item sets using FP Growth with min_support = 0.2:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>support</th>\n",
" <th>itemsets</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.466667</td>\n",
" <td>(Bread)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.200000</td>\n",
" <td>(Scandinavian)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.200000</td>\n",
" <td>(Muffin)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.400000</td>\n",
" <td>(Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.266667</td>\n",
" <td>(Coffee)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.333333</td>\n",
" <td>(Medialuna)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.200000</td>\n",
" <td>(Bread, Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.200000</td>\n",
" <td>(Coffee, Pastry)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.200000</td>\n",
" <td>(Medialuna, Pastry)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" support itemsets\n",
"0 0.466667 (Bread)\n",
"1 0.200000 (Scandinavian)\n",
"2 0.200000 (Muffin)\n",
"3 0.400000 (Pastry)\n",
"4 0.266667 (Coffee)\n",
"5 0.333333 (Medialuna)\n",
"6 0.200000 (Bread, Pastry)\n",
"7 0.200000 (Coffee, Pastry)\n",
"8 0.200000 (Medialuna, Pastry)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_support = CONSTANTS.MIN_SUPPORT_VALUE\n",
"repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"print(f\"Repeated item sets using FP Growth with min_support = {min_support}:\")\n",
"repeated_item_sets_fpg"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***3.3. Get rules***\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Association rules using FP Growth with min_support = 0.2 and min_confidence = 0.5:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" <th>zhangs_metric</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Bread)</td>\n",
" <td>0.400000</td>\n",
" <td>0.466667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.071429</td>\n",
" <td>0.013333</td>\n",
" <td>1.066667</td>\n",
" <td>0.111111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.75</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>2.400000</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n",
" <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.875000</td>\n",
" <td>0.093333</td>\n",
" <td>1.466667</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n",
" <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n",
" <td>0.60</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n",
" <td>0.50</td>\n",
" <td>1.500000</td>\n",
" <td>0.066667</td>\n",
" <td>1.333333</td>\n",
" <td>0.555556</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support \\\n",
"0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"1 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"2 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"3 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"4 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"\n",
" confidence lift leverage conviction zhangs_metric \n",
"0 0.50 1.071429 0.013333 1.066667 0.111111 \n",
"1 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"2 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"3 0.60 1.500000 0.066667 1.500000 0.500000 \n",
"4 0.50 1.500000 0.066667 1.333333 0.555556 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_confidence = CONSTANTS.MIN_CONFIDENCE_VALUE\n",
"rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"print(f\"Association rules using FP Growth with min_support = {min_support} and min_confidence = {min_confidence}:\")\n",
"rules_fpg"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ***4. Performance comparison***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***4.1. Load dataset***"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset loaded successfully with shape: (9465, 94)\n"
]
}
],
"source": [
"df = None\n",
"df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***4.2. Measure time for Apriori***"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Execution time for Apriori: 10.674063444137573 seconds\n"
]
}
],
"source": [
"start_time = time.time()\n",
"min_support = 0.0001\n",
"repeated_item_sets_apriori = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'apriori', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n",
"rules_apriori = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_apriori, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"print(f\"Execution time for Apriori: {execution_time} seconds\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***4.3. Measure time for FP Growth***"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Execution time for FP Growth: 2.7980523109436035 seconds\n"
]
}
],
"source": [
"start_time = time.time()\n",
"min_support = 0.0001\n",
"repeated_item_sets_fpg = HELPERS.find_repeated_item_sets(\n",
" algorithm = 'fpgrowth', data = df, min_support = min_support)\n",
"\n",
"min_confidence = 0.0001\n",
"rules_fpg = HELPERS.get_rules(\n",
" repeated_item_sets = repeated_item_sets_fpg, min_confidence = min_confidence)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"print(f\"Execution time for FP Growth: {execution_time} seconds\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ***4.4. Results***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 4 times faster!)***. \n",
"> **This is because `FP Growth` requires access the dataset multiple times to find repeated groups, when `Apriori` constructs the tree from the beginning and then don't access dataset again (working only with tree)**"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}