Commit 6f099ed0 authored by Almouhannad's avatar Almouhannad

Use helpers

parent ac58b5ff
__pycache__/constants.cpython-311.pyc __pycache__/constants.cpython-311.pyc
data/bread_basket_preprocessed.csv data/bread_basket_preprocessed.csv
__pycache__/helpers.cpython-311.pyc
...@@ -12,4 +12,6 @@ ...@@ -12,4 +12,6 @@
> - **FP Growth** > - **FP Growth**
> 1. **Performance comparison between the two algorithms** > 1. **Performance comparison between the two algorithms**
> ***This project contains a python file `constants.py` containing some fixed values used in `hw.ipynb`, referred to as the `CONSTANTS` class*** > ***This project contains a python file `constants.py` containing some fixed values used in `hw1.ipynb`, referred to as the `CONSTANTS` class***
> ***This project contains a python file `helpers.py` containing some helper functions used in `hw1.ipynb`, referred to as the `HELPERS` class***
\ No newline at end of file
import pandas as pd
class HELPERS:
    """Static helper utilities shared by the notebooks (referred to as the HELPERS class)."""

    @staticmethod
    def read_dataset_from_csv(path):
        """Read a CSV dataset into a pandas DataFrame.

        Parameters
        ----------
        path : str or os.PathLike
            Location of the CSV file to load.

        Returns
        -------
        pandas.DataFrame or None
            The loaded dataset on success; ``None`` when reading fails
            (an error message is printed instead of raising, so callers
            must check for ``None`` before using the result).
        """
        try:
            data = pd.read_csv(path)
            return data
        except FileNotFoundError:
            print(f"Error: The file at {path} was not found.")
        except pd.errors.EmptyDataError:
            print(f"Error: The file at {path} is empty.")
        except pd.errors.ParserError:
            print(f"Error: The file at {path} could not be parsed.")
        except Exception as e:
            # Deliberate catch-all: keeps notebook execution alive; the
            # caller's shape assertion will surface the failure.
            print(f"An unexpected error occurred: {e}")
        # Explicit: every error path yields None (was an implicit fall-through).
        return None
\ No newline at end of file
...@@ -59,6 +59,7 @@ ...@@ -59,6 +59,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from helpers import HELPERS\n",
"from constants import CONSTANTS\n", "from constants import CONSTANTS\n",
"# Some more magic so that the notebook will reload external python modules;\n", "# Some more magic so that the notebook will reload external python modules;\n",
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n", "# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
...@@ -95,8 +96,8 @@ ...@@ -95,8 +96,8 @@
} }
], ],
"source": [ "source": [
"df = pd.read_csv(CONSTANTS.DATASET_PATH)\n", "df = None\n",
"df.shape\n", "df = HELPERS.read_dataset_from_csv(CONSTANTS.DATASET_PATH)\n",
"assert df.shape == CONSTANTS.DATASET_SHAPE, f\"Expected shape {CONSTANTS.DATASET_SHAPE}, but got {df.shape}\" \n", "assert df.shape == CONSTANTS.DATASET_SHAPE, f\"Expected shape {CONSTANTS.DATASET_SHAPE}, but got {df.shape}\" \n",
"print(\"Dataset loaded successfully with shape:\", df.shape)" "print(\"Dataset loaded successfully with shape:\", df.shape)"
] ]
...@@ -469,9 +470,7 @@ ...@@ -469,9 +470,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"df.to_csv(CONSTANTS.PREPROCESSED_DATASET_PATH, index=False)\n", "df.to_csv(CONSTANTS.PREPROCESSED_DATASET_PATH, index=False)"
"# Free memory\n",
"df = None"
] ]
}, },
{ {
...@@ -497,15 +496,15 @@ ...@@ -497,15 +496,15 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Dataset loaded successfully\n" "Dataset loaded successfully with shape: (9465, 94)\n"
] ]
} }
], ],
"source": [ "source": [
"df = None\n", "df = None\n",
"df = pd.read_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n", "df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n", "assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(f\"Dataset loaded successfully\")" "print(\"Dataset loaded successfully with shape:\", df.shape)"
] ]
}, },
{ {
...@@ -602,17 +601,17 @@ ...@@ -602,17 +601,17 @@
" <tr>\n", " <tr>\n",
" <th>6</th>\n", " <th>6</th>\n",
" <td>0.200000</td>\n", " <td>0.200000</td>\n",
" <td>(Pastry, Bread)</td>\n", " <td>(Bread, Pastry)</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>7</th>\n", " <th>7</th>\n",
" <td>0.200000</td>\n", " <td>0.200000</td>\n",
" <td>(Pastry, Coffee)</td>\n", " <td>(Coffee, Pastry)</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>8</th>\n", " <th>8</th>\n",
" <td>0.200000</td>\n", " <td>0.200000</td>\n",
" <td>(Pastry, Medialuna)</td>\n", " <td>(Medialuna, Pastry)</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
...@@ -626,9 +625,9 @@ ...@@ -626,9 +625,9 @@
"3 0.200000 (Muffin)\n", "3 0.200000 (Muffin)\n",
"4 0.400000 (Pastry)\n", "4 0.400000 (Pastry)\n",
"5 0.200000 (Scandinavian)\n", "5 0.200000 (Scandinavian)\n",
"6 0.200000 (Pastry, Bread)\n", "6 0.200000 (Bread, Pastry)\n",
"7 0.200000 (Pastry, Coffee)\n", "7 0.200000 (Coffee, Pastry)\n",
"8 0.200000 (Pastry, Medialuna)" "8 0.200000 (Medialuna, Pastry)"
] ]
}, },
"execution_count": 17, "execution_count": 17,
...@@ -711,55 +710,55 @@ ...@@ -711,55 +710,55 @@
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n", " <td>(Coffee)</td>\n",
" <td>0.400000</td>\n", " <td>(Pastry)</td>\n",
" <td>0.266667</td>\n", " <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.50</td>\n", " <td>0.75</td>\n",
" <td>1.875000</td>\n", " <td>1.875000</td>\n",
" <td>0.093333</td>\n", " <td>0.093333</td>\n",
" <td>1.466667</td>\n", " <td>2.400000</td>\n",
" <td>0.777778</td>\n", " <td>0.636364</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>2</th>\n", " <th>2</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n", " <td>(Pastry)</td>\n",
" <td>0.266667</td>\n", " <td>(Coffee)</td>\n",
" <td>0.400000</td>\n", " <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.75</td>\n", " <td>0.50</td>\n",
" <td>1.875000</td>\n", " <td>1.875000</td>\n",
" <td>0.093333</td>\n", " <td>0.093333</td>\n",
" <td>2.400000</td>\n", " <td>1.466667</td>\n",
" <td>0.636364</td>\n", " <td>0.777778</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n", " <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n", " <td>(Pastry)</td>\n",
" <td>0.333333</td>\n", " <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.50</td>\n", " <td>0.60</td>\n",
" <td>1.500000</td>\n", " <td>1.500000</td>\n",
" <td>0.066667</td>\n", " <td>0.066667</td>\n",
" <td>1.333333</td>\n", " <td>1.500000</td>\n",
" <td>0.555556</td>\n", " <td>0.500000</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n", " <td>(Pastry)</td>\n",
" <td>0.333333</td>\n", " <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n", " <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.60</td>\n", " <td>0.50</td>\n",
" <td>1.500000</td>\n", " <td>1.500000</td>\n",
" <td>0.066667</td>\n", " <td>0.066667</td>\n",
" <td>1.500000</td>\n", " <td>1.333333</td>\n",
" <td>0.500000</td>\n", " <td>0.555556</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
...@@ -768,17 +767,17 @@ ...@@ -768,17 +767,17 @@
"text/plain": [ "text/plain": [
" antecedents consequents antecedent support consequent support support \\\n", " antecedents consequents antecedent support consequent support support \\\n",
"0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n", "0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"1 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n", "1 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"2 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n", "2 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"3 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n", "3 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"4 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n", "4 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"\n", "\n",
" confidence lift leverage conviction zhangs_metric \n", " confidence lift leverage conviction zhangs_metric \n",
"0 0.50 1.071429 0.013333 1.066667 0.111111 \n", "0 0.50 1.071429 0.013333 1.066667 0.111111 \n",
"1 0.50 1.875000 0.093333 1.466667 0.777778 \n", "1 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"2 0.75 1.875000 0.093333 2.400000 0.636364 \n", "2 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"3 0.50 1.500000 0.066667 1.333333 0.555556 \n", "3 0.60 1.500000 0.066667 1.500000 0.500000 \n",
"4 0.60 1.500000 0.066667 1.500000 0.500000 " "4 0.50 1.500000 0.066667 1.333333 0.555556 "
] ]
}, },
"execution_count": 18, "execution_count": 18,
...@@ -816,15 +815,15 @@ ...@@ -816,15 +815,15 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Dataset loaded successfully\n" "Dataset loaded successfully with shape: (9465, 94)\n"
] ]
} }
], ],
"source": [ "source": [
"df = None\n", "df = None\n",
"df = pd.read_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n", "df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n", "assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(f\"Dataset loaded successfully\")" "print(\"Dataset loaded successfully with shape:\", df.shape)"
] ]
}, },
{ {
...@@ -921,17 +920,17 @@ ...@@ -921,17 +920,17 @@
" <tr>\n", " <tr>\n",
" <th>6</th>\n", " <th>6</th>\n",
" <td>0.200000</td>\n", " <td>0.200000</td>\n",
" <td>(Pastry, Bread)</td>\n", " <td>(Bread, Pastry)</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>7</th>\n", " <th>7</th>\n",
" <td>0.200000</td>\n", " <td>0.200000</td>\n",
" <td>(Pastry, Coffee)</td>\n", " <td>(Coffee, Pastry)</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>8</th>\n", " <th>8</th>\n",
" <td>0.200000</td>\n", " <td>0.200000</td>\n",
" <td>(Pastry, Medialuna)</td>\n", " <td>(Medialuna, Pastry)</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
...@@ -945,9 +944,9 @@ ...@@ -945,9 +944,9 @@
"3 0.400000 (Pastry)\n", "3 0.400000 (Pastry)\n",
"4 0.266667 (Coffee)\n", "4 0.266667 (Coffee)\n",
"5 0.333333 (Medialuna)\n", "5 0.333333 (Medialuna)\n",
"6 0.200000 (Pastry, Bread)\n", "6 0.200000 (Bread, Pastry)\n",
"7 0.200000 (Pastry, Coffee)\n", "7 0.200000 (Coffee, Pastry)\n",
"8 0.200000 (Pastry, Medialuna)" "8 0.200000 (Medialuna, Pastry)"
] ]
}, },
"execution_count": 21, "execution_count": 21,
...@@ -1030,55 +1029,55 @@ ...@@ -1030,55 +1029,55 @@
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Coffee)</td>\n", " <td>(Coffee)</td>\n",
" <td>0.400000</td>\n", " <td>(Pastry)</td>\n",
" <td>0.266667</td>\n", " <td>0.266667</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.50</td>\n", " <td>0.75</td>\n",
" <td>1.875000</td>\n", " <td>1.875000</td>\n",
" <td>0.093333</td>\n", " <td>0.093333</td>\n",
" <td>1.466667</td>\n", " <td>2.400000</td>\n",
" <td>0.777778</td>\n", " <td>0.636364</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>2</th>\n", " <th>2</th>\n",
" <td>(Coffee)</td>\n",
" <td>(Pastry)</td>\n", " <td>(Pastry)</td>\n",
" <td>0.266667</td>\n", " <td>(Coffee)</td>\n",
" <td>0.400000</td>\n", " <td>0.400000</td>\n",
" <td>0.266667</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.75</td>\n", " <td>0.50</td>\n",
" <td>1.875000</td>\n", " <td>1.875000</td>\n",
" <td>0.093333</td>\n", " <td>0.093333</td>\n",
" <td>2.400000</td>\n", " <td>1.466667</td>\n",
" <td>0.636364</td>\n", " <td>0.777778</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
" <td>(Pastry)</td>\n",
" <td>(Medialuna)</td>\n", " <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n", " <td>(Pastry)</td>\n",
" <td>0.333333</td>\n", " <td>0.333333</td>\n",
" <td>0.400000</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.50</td>\n", " <td>0.60</td>\n",
" <td>1.500000</td>\n", " <td>1.500000</td>\n",
" <td>0.066667</td>\n", " <td>0.066667</td>\n",
" <td>1.333333</td>\n", " <td>1.500000</td>\n",
" <td>0.555556</td>\n", " <td>0.500000</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
" <td>(Medialuna)</td>\n",
" <td>(Pastry)</td>\n", " <td>(Pastry)</td>\n",
" <td>0.333333</td>\n", " <td>(Medialuna)</td>\n",
" <td>0.400000</td>\n", " <td>0.400000</td>\n",
" <td>0.333333</td>\n",
" <td>0.2</td>\n", " <td>0.2</td>\n",
" <td>0.60</td>\n", " <td>0.50</td>\n",
" <td>1.500000</td>\n", " <td>1.500000</td>\n",
" <td>0.066667</td>\n", " <td>0.066667</td>\n",
" <td>1.500000</td>\n", " <td>1.333333</td>\n",
" <td>0.500000</td>\n", " <td>0.555556</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
...@@ -1087,17 +1086,17 @@ ...@@ -1087,17 +1086,17 @@
"text/plain": [ "text/plain": [
" antecedents consequents antecedent support consequent support support \\\n", " antecedents consequents antecedent support consequent support support \\\n",
"0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n", "0 (Pastry) (Bread) 0.400000 0.466667 0.2 \n",
"1 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n", "1 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n",
"2 (Coffee) (Pastry) 0.266667 0.400000 0.2 \n", "2 (Pastry) (Coffee) 0.400000 0.266667 0.2 \n",
"3 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n", "3 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n",
"4 (Medialuna) (Pastry) 0.333333 0.400000 0.2 \n", "4 (Pastry) (Medialuna) 0.400000 0.333333 0.2 \n",
"\n", "\n",
" confidence lift leverage conviction zhangs_metric \n", " confidence lift leverage conviction zhangs_metric \n",
"0 0.50 1.071429 0.013333 1.066667 0.111111 \n", "0 0.50 1.071429 0.013333 1.066667 0.111111 \n",
"1 0.50 1.875000 0.093333 1.466667 0.777778 \n", "1 0.75 1.875000 0.093333 2.400000 0.636364 \n",
"2 0.75 1.875000 0.093333 2.400000 0.636364 \n", "2 0.50 1.875000 0.093333 1.466667 0.777778 \n",
"3 0.50 1.500000 0.066667 1.333333 0.555556 \n", "3 0.60 1.500000 0.066667 1.500000 0.500000 \n",
"4 0.60 1.500000 0.066667 1.500000 0.500000 " "4 0.50 1.500000 0.066667 1.333333 0.555556 "
] ]
}, },
"execution_count": 22, "execution_count": 22,
...@@ -1135,15 +1134,15 @@ ...@@ -1135,15 +1134,15 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Dataset loaded successfully\n" "Dataset loaded successfully with shape: (9465, 94)\n"
] ]
} }
], ],
"source": [ "source": [
"df = None\n", "df = None\n",
"df = pd.read_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n", "df = HELPERS.read_dataset_from_csv(CONSTANTS.PREPROCESSED_DATASET_PATH)\n",
"assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n", "assert df.shape == CONSTANTS.PREPROCESSED_DATASET_SHAPE, f\"Expected shape {CONSTANTS.PREPROCESSED_DATASET_SHAPE}, but got {df.shape}\" \n",
"print(f\"Dataset loaded successfully\")" "print(\"Dataset loaded successfully with shape:\", df.shape)"
] ]
}, },
{ {
...@@ -1162,7 +1161,7 @@ ...@@ -1162,7 +1161,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Execution time for Apriori: 11.543023109436035 seconds\n" "Execution time for Apriori: 11.037009716033936 seconds\n"
] ]
} }
], ],
...@@ -1193,7 +1192,7 @@ ...@@ -1193,7 +1192,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Execution time for FP Growth: 2.7200090885162354 seconds\n" "Execution time for FP Growth: 2.707993268966675 seconds\n"
] ]
} }
], ],
...@@ -1219,7 +1218,7 @@ ...@@ -1219,7 +1218,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"> **As we can notice, `FP Growth` is much faster than `Apriori` ***(about 5 times faster!)***.** \n", "> **As we can notice, `FP Growth` is much faster than `Apriori`** ***(about 5 times faster!)***. \n",
"> **This is because `Apriori` requires accessing the dataset multiple times to find frequent itemsets, whereas `FP Growth` builds its tree at the beginning and then doesn't access the dataset again (working only with the tree)**" "> **This is because `Apriori` requires accessing the dataset multiple times to find frequent itemsets, whereas `FP Growth` builds its tree at the beginning and then doesn't access the dataset again (working only with the tree)**"
] ]
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment