Commit 899a8d85 authored by drnull03's avatar drnull03

first init commit

parents
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gcMnwWMFK0qG",
"outputId": "4c1d6dfa-fa64-494a-960a-8a88dc65ff38"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hello world\n"
]
}
],
"source": [
"### testing the python kernel\n",
"print(\"hello world\")"
]
},
{
"cell_type": "code",
"source": [
"# Question 1\n",
"\n",
"\n",
"import numpy as np\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"\n",
"# two input matrices\n",
"A = np.random.rand(N, N)\n",
"B = np.random.rand(N, N)\n",
"\n",
"# creating result matrix\n",
"C = np.zeros((N, N))\n",
"\n",
"# for measuring performance\n",
"start_time = time.time()\n",
"\n",
"# the real deal: matrix multiplication with a triple nested loop\n",
"for i in range(N):\n",
" for j in range(N):\n",
" for k in range(N):\n",
" C[i][j] += A[i][k] * B[k][j]\n",
"\n",
"# for performance purposes\n",
"end_time = time.time()\n",
"\n",
"# calculating time\n",
"execution_time = end_time - start_time\n",
"\n",
"print(\"Matrix size:\", N, \"x\", N)\n",
"print(\"Execution time (CPU, triple for-loop):\", execution_time, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LynXtrdhNgf8",
"outputId": "e108a954-ad19-40e9-ae34-feb263bfc3f9"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Matrix size: 300 x 300\n",
"Execution time (CPU, triple for-loop): 34.43796896934509 seconds\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#testing the cuda setup before going on to question 2\n",
"!nvidia-smi\n",
"# got 'command not found' at this step, which means I should switch the runtime environment to a GPU one\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Uol3qh1QPAne",
"outputId": "221de175-f687-444a-8dfe-900b97b8bcfa"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Sat Jan 17 09:16:15 2026 \n",
"+-----------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n",
"|-----------------------------------------+------------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+========================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 62C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=========================================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# some more useless testing just to make sure\n",
"\n",
"import torch\n",
"\n",
"print(\"CUDA available:\", torch.cuda.is_available())\n",
"print(\"CUDA version:\", torch.version.cuda)\n",
"print(\"GPU name:\", torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"No GPU\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VIFqQT69Piga",
"outputId": "44064402-fa9e-4757-eb4b-9c5167d2ec50"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CUDA available: True\n",
"CUDA version: 12.6\n",
"GPU name: Tesla T4\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# this time the testing is done using cupy\n",
"\n",
"import cupy as cp\n",
"\n",
"print(\"CuPy version:\", cp.__version__)\n",
"print(\"CUDA available:\", cp.cuda.is_available())\n",
"print(\"GPU count:\", cp.cuda.runtime.getDeviceCount())\n",
"print(\"GPU name:\", cp.cuda.runtime.getDeviceProperties(0)['name'])\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ALW1MFHMPrMt",
"outputId": "8c2b3ca1-f295-44e1-c55a-82ab010a9bba"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CuPy version: 13.6.0\n",
"CUDA available: True\n",
"GPU count: 1\n",
"GPU name: b'Tesla T4'\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# one final check: verifying the C compiler is available, in case I even want to write C code\n",
"\n",
"!nvcc --version\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Sk1i4QVbP0f9",
"outputId": "4ac769b1-8dbb-4194-bbc9-ecc80b8aa996"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2024 NVIDIA Corporation\n",
"Built on Thu_Jun__6_02:18:23_PDT_2024\n",
"Cuda compilation tools, release 12.5, V12.5.82\n",
"Build cuda_12.5.r12.5/compiler.34385749_0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import cupy as cp\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"\n",
"# creating two matrices directly on the GPU\n",
"A_gpu = cp.random.rand(N, N)\n",
"B_gpu = cp.random.rand(N, N)\n",
"\n",
"# this line is added by the AI\n",
"# gonna look this up hold on\n",
"# The function cp.cuda.Stream.null.synchronize() blocks the CPU (host) until all previously scheduled operations in the default CUDA stream on the GPU have completed.\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"# timing\n",
"start_time = time.time()\n",
"\n",
"# using the matmul built in function\n",
"C_gpu = cp.matmul(A_gpu, B_gpu)\n",
"\n",
"\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"# timing\n",
"end_time = time.time()\n",
"\n",
"# calc time\n",
"execution_time = end_time - start_time\n",
"\n",
"print(\"Matrix size:\", N, \"x\", N)\n",
"print(\"Execution time (GPU using CuPy):\", execution_time, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "K8NggDFKRAFZ",
"outputId": "06cbbf01-6974-487a-eaf3-c109cf9beb5e"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Matrix size: 300 x 300\n",
"Execution time (GPU using CuPy): 0.1387937068939209 seconds\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"We notice a big improvement here.\n",
"\n",
"If we calculate the speed-up factor:\n",
"34.43796896934509 / 0.1387937068939209 ≈ 248×"
],
"metadata": {
"id": "9kghJt_BRjjf"
}
},
{
"cell_type": "code",
"source": [
"import cupy as cp\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"\n",
"# create matrices on the GPU\n",
"A = cp.random.rand(N, N).astype(cp.float32)\n",
"B = cp.random.rand(N, N).astype(cp.float32)\n",
"C = cp.zeros((N, N), dtype=cp.float32)\n",
"\n",
"# c code\n",
"kernel_code = r'''\n",
"extern \"C\" __global__\n",
"void matmul(const float* A, const float* B, float* C, int N) {\n",
"\n",
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
" if (row < N && col < N) {\n",
" float sum = 0.0f;\n",
" for (int k = 0; k < N; k++) {\n",
" sum += A[row * N + k] * B[k * N + col];\n",
" }\n",
" C[row * N + col] = sum;\n",
" }\n",
"}\n",
"'''\n",
"\n",
"# compiling the kernel\n",
"matmul_kernel = cp.RawKernel(kernel_code, 'matmul')\n"
],
"metadata": {
"id": "u9XAw7B7Xr2e"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"We use:\n",
"\n",
"row = blockIdx.y * blockDim.y + threadIdx.y;\n",
"col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
"\n",
"instead of a single index because matrix data is two-dimensional, and mapping GPU threads to a 2D structure is more efficient and natural.\n",
"\n",
"1. Matrices are 2D\n",
"\n",
"A matrix element is identified by (row, column), not by a single number.\n",
"\n",
"Using (row, col) allows:\n",
"\n",
"Each GPU thread to compute exactly one matrix element\n",
"\n",
"A clear mapping between threads and matrix elements:\n",
"\n",
"C[row][col]\n",
"\n",
"2. GPU threads and blocks are naturally 2D\n",
"\n",
"CUDA supports 1D, 2D, and 3D grids and blocks.\n",
"\n",
"Using:\n",
"\n",
"x dimension → columns\n",
"\n",
"y dimension → rows\n",
"\n",
"matches the matrix layout directly and avoids extra calculations."
],
"metadata": {
"id": "mSaLlAhncCxS"
}
},
{
"cell_type": "code",
"source": [
"# first test: 16x16 block size\n",
"block_size = (16, 16)\n",
"grid_size = (\n",
" (N + block_size[0] - 1) // block_size[0],\n",
" (N + block_size[1] - 1) // block_size[1]\n",
")\n",
"\n",
"# synch\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"start = time.time()\n",
"\n",
"matmul_kernel(\n",
" grid_size,\n",
" block_size,\n",
" (A, B, C, N)\n",
")\n",
"\n",
"# synch\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"end = time.time()\n",
"\n",
"print(\"Custom kernel execution time:\", end - start, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bCHTIqpbYRpU",
"outputId": "f8cde82f-a075-4e91-917a-d5c0dfd7e486"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Custom kernel execution time: 0.0509185791015625 seconds\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# comparing different block sizes in a for loop\n",
"block_sizes = [(8,8), (16,16), (32,32)]\n",
"\n",
"for bs in block_sizes:\n",
" block = bs\n",
" grid = (\n",
" (N + block[0] - 1) // block[0],\n",
" (N + block[1] - 1) // block[1]\n",
" )\n",
"\n",
" cp.cuda.Stream.null.synchronize()\n",
" start = time.time()\n",
"\n",
" matmul_kernel(grid, block, (A, B, C, N))\n",
"\n",
" cp.cuda.Stream.null.synchronize()\n",
" end = time.time()\n",
"\n",
" print(f\"Block size {bs}: {end - start:.4f} seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Xd1qB8duYlbV",
"outputId": "18565577-0f09-464c-f325-3f7bc75b51cc"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Block size (8, 8): 0.0004 seconds\n",
"Block size (16, 16): 0.0003 seconds\n",
"Block size (32, 32): 0.0003 seconds\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"Here we can see that the custom C kernel took 0.0509185791015625 s while the built-in function took longer (0.1387937068939209 s). I half expected that because raw C code is fast, but according to Google the built-in function should actually be faster because it uses tiling. Note that both of these first calls include one-time overhead (raw-kernel JIT compilation / cuBLAS initialization) — the repeated runs above finish in ~0.0003 s, so the first-call timings overstate the steady-state cost."
],
"metadata": {
"id": "HwQbN7IsZDFk"
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "gX_xjXcsaxTd"
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "LiK1xHoPaxQr"
}
},
{
"cell_type": "markdown",
"source": [
"Before proceeding I should know what tiling is.\n",
"According to ChatGPT: \"The core idea (one sentence):\n",
"\n",
"Tiling = cut a big problem into small pieces that fit in fast memory, reuse them many times, then move to the next piece.\""
],
"metadata": {
"id": "G2gjndpiZvna"
}
},
{
"cell_type": "code",
"source": [
"# Question 5\n",
"\n",
"import cupy as cp\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"TILE = 16\n",
"\n",
"A = cp.random.rand(N, N).astype(cp.float32)\n",
"B = cp.random.rand(N, N).astype(cp.float32)\n",
"C = cp.zeros((N, N), dtype=cp.float32)\n",
"\n",
"# tiled kernel\n",
"kernel_code = rf'''\n",
"extern \"C\" __global__\n",
"void matmul_tiled(const float* A, const float* B, float* C, int N) {{\n",
"\n",
" __shared__ float As[{TILE}][{TILE}];\n",
" __shared__ float Bs[{TILE}][{TILE}];\n",
"\n",
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
" float sum = 0.0f;\n",
"\n",
" for (int t = 0; t < (N + {TILE} - 1) / {TILE}; t++) {{\n",
"\n",
" int A_col = t * {TILE} + threadIdx.x;\n",
" int B_row = t * {TILE} + threadIdx.y;\n",
"\n",
" // Load tiles into shared memory\n",
" if (row < N && A_col < N)\n",
" As[threadIdx.y][threadIdx.x] = A[row * N + A_col];\n",
" else\n",
" As[threadIdx.y][threadIdx.x] = 0.0f;\n",
"\n",
" if (B_row < N && col < N)\n",
" Bs[threadIdx.y][threadIdx.x] = B[B_row * N + col];\n",
" else\n",
" Bs[threadIdx.y][threadIdx.x] = 0.0f;\n",
"\n",
" __syncthreads();\n",
"\n",
" // Compute partial result\n",
" for (int k = 0; k < {TILE}; k++)\n",
" sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];\n",
"\n",
" __syncthreads();\n",
" }}\n",
"\n",
" if (row < N && col < N)\n",
" C[row * N + col] = sum;\n",
"}}\n",
"'''\n",
"\n",
"# Compile kernel\n",
"matmul_tiled = cp.RawKernel(kernel_code, 'matmul_tiled')\n"
],
"metadata": {
"id": "UrEyT6Iuce7X"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"block = (TILE, TILE)\n",
"grid = (\n",
" (N + TILE - 1) // TILE,\n",
" (N + TILE - 1) // TILE\n",
")\n",
"\n",
"cp.cuda.Stream.null.synchronize()\n",
"start = time.time()\n",
"\n",
"matmul_tiled(grid, block, (A, B, C, N))\n",
"\n",
"cp.cuda.Stream.null.synchronize()\n",
"end = time.time()\n",
"\n",
"print(\"Tiled kernel execution time:\", end - start, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aBrJs2Txcrrn",
"outputId": "f8b6b81b-58d3-49b2-dfbc-953de6a32874"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Tiled kernel execution time: 0.0007283687591552734 seconds\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"Better performance overall."
],
"metadata": {
"id": "rOQdAkTCc2M-"
}
}
]
}
\ No newline at end of file
To run the code, just open the .ipynb in Colab or Jupyter Notebook and execute each cell.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment