Commit 899a8d85 authored by drnull03's avatar drnull03

first init commit

parents
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gcMnwWMFK0qG",
"outputId": "4c1d6dfa-fa64-494a-960a-8a88dc65ff38"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hello world\n"
]
}
],
"source": [
"### testing the python kernel\n",
"print(\"hello world\")"
]
},
{
"cell_type": "code",
"source": [
"# Question 1\n",
"\n",
"\n",
"import numpy as np\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"\n",
"# two input matrices\n",
"A = np.random.rand(N, N)\n",
"B = np.random.rand(N, N)\n",
"\n",
"# creating result matrix\n",
"C = np.zeros((N, N))\n",
"\n",
"# for measuring performance\n",
"start_time = time.time()\n",
"\n",
"# the real deal: matrix multiplication with a triple nested loop\n",
"for i in range(N):\n",
" for j in range(N):\n",
" for k in range(N):\n",
" C[i][j] += A[i][k] * B[k][j]\n",
"\n",
"# for performance purposes\n",
"end_time = time.time()\n",
"\n",
"# calculating time\n",
"execution_time = end_time - start_time\n",
"\n",
"print(\"Matrix size:\", N, \"x\", N)\n",
"print(\"Execution time (CPU, triple for-loop):\", execution_time, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LynXtrdhNgf8",
"outputId": "e108a954-ad19-40e9-ae34-feb263bfc3f9"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Matrix size: 300 x 300\n",
"Execution time (CPU, triple for-loop): 34.43796896934509 seconds\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#testing the cuda setup before going on to question 2\n",
"!nvidia-smi\n",
"# got 'command not found' at this step, which means I should switch the runtime environment to a GPU one\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Uol3qh1QPAne",
"outputId": "221de175-f687-444a-8dfe-900b97b8bcfa"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Sat Jan 17 09:16:15 2026 \n",
"+-----------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n",
"|-----------------------------------------+------------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+========================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 62C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=========================================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# some more useless testing just to make sure\n",
"\n",
"import torch\n",
"\n",
"print(\"CUDA available:\", torch.cuda.is_available())\n",
"print(\"CUDA version:\", torch.version.cuda)\n",
"print(\"GPU name:\", torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"No GPU\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VIFqQT69Piga",
"outputId": "44064402-fa9e-4757-eb4b-9c5167d2ec50"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CUDA available: True\n",
"CUDA version: 12.6\n",
"GPU name: Tesla T4\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# this time the testing is done using cupy\n",
"\n",
"import cupy as cp\n",
"\n",
"print(\"CuPy version:\", cp.__version__)\n",
"print(\"CUDA available:\", cp.cuda.is_available())\n",
"print(\"GPU count:\", cp.cuda.runtime.getDeviceCount())\n",
"print(\"GPU name:\", cp.cuda.runtime.getDeviceProperties(0)['name'])\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ALW1MFHMPrMt",
"outputId": "8c2b3ca1-f295-44e1-c55a-82ab010a9bba"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CuPy version: 13.6.0\n",
"CUDA available: True\n",
"GPU count: 1\n",
"GPU name: b'Tesla T4'\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# one final check: verifying the C compiler is available, in case I even want to write C code\n",
"\n",
"!nvcc --version\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Sk1i4QVbP0f9",
"outputId": "4ac769b1-8dbb-4194-bbc9-ecc80b8aa996"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2024 NVIDIA Corporation\n",
"Built on Thu_Jun__6_02:18:23_PDT_2024\n",
"Cuda compilation tools, release 12.5, V12.5.82\n",
"Build cuda_12.5.r12.5/compiler.34385749_0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import cupy as cp\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"\n",
"# creating two matrices directly on the GPU\n",
"A_gpu = cp.random.rand(N, N)\n",
"B_gpu = cp.random.rand(N, N)\n",
"\n",
"# this line is added by the AI\n",
"# gonna look this up hold on\n",
"# The function cp.cuda.Stream.null.synchronize() blocks the CPU (host) until all previously scheduled operations in the default CUDA stream on the GPU have completed.\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"# timing\n",
"start_time = time.time()\n",
"\n",
"# using the matmul built in function\n",
"C_gpu = cp.matmul(A_gpu, B_gpu)\n",
"\n",
"\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"# timing\n",
"end_time = time.time()\n",
"\n",
"# calc time\n",
"execution_time = end_time - start_time\n",
"\n",
"print(\"Matrix size:\", N, \"x\", N)\n",
"print(\"Execution time (GPU using CuPy):\", execution_time, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "K8NggDFKRAFZ",
"outputId": "06cbbf01-6974-487a-eaf3-c109cf9beb5e"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Matrix size: 300 x 300\n",
"Execution time (GPU using CuPy): 0.1387937068939209 seconds\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"We notice a big improvement here.\n",
"\n",
"If we calculate the speed-up factor:\n",
"34.43796896934509 / 0.1387937068939209 ≈ 248×"
],
"metadata": {
"id": "9kghJt_BRjjf"
}
},
{
"cell_type": "code",
"source": [
"import cupy as cp\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"\n",
"# create matrices on the GPU\n",
"A = cp.random.rand(N, N).astype(cp.float32)\n",
"B = cp.random.rand(N, N).astype(cp.float32)\n",
"C = cp.zeros((N, N), dtype=cp.float32)\n",
"\n",
"# c code\n",
"kernel_code = r'''\n",
"extern \"C\" __global__\n",
"void matmul(const float* A, const float* B, float* C, int N) {\n",
"\n",
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
" if (row < N && col < N) {\n",
" float sum = 0.0f;\n",
" for (int k = 0; k < N; k++) {\n",
" sum += A[row * N + k] * B[k * N + col];\n",
" }\n",
" C[row * N + col] = sum;\n",
" }\n",
"}\n",
"'''\n",
"\n",
"# compiling the kernel\n",
"matmul_kernel = cp.RawKernel(kernel_code, 'matmul')\n"
],
"metadata": {
"id": "u9XAw7B7Xr2e"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"We use:\n",
"\n",
"row = blockIdx.y * blockDim.y + threadIdx.y;\n",
"col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
"\n",
"instead of a single index because matrix data is two-dimensional, and mapping GPU threads to a 2D structure is more efficient and natural.\n",
"\n",
"1. Matrices are 2D\n",
"\n",
"A matrix element is identified by (row, column), not by a single number.\n",
"\n",
"Using (row, col) allows:\n",
"\n",
"Each GPU thread to compute exactly one matrix element\n",
"\n",
"A clear mapping between threads and matrix elements:\n",
"\n",
"C[row][col]\n",
"\n",
"2. GPU threads and blocks are naturally 2D\n",
"\n",
"CUDA supports 1D, 2D, and 3D grids and blocks.\n",
"\n",
"Using:\n",
"\n",
"x dimension → columns\n",
"\n",
"y dimension → rows\n",
"\n",
"matches the matrix layout directly and avoids extra calculations."
],
"metadata": {
"id": "mSaLlAhncCxS"
}
},
{
"cell_type": "code",
"source": [
"# first test: 16x16 block size\n",
"block_size = (16, 16)\n",
"grid_size = (\n",
" (N + block_size[0] - 1) // block_size[0],\n",
" (N + block_size[1] - 1) // block_size[1]\n",
")\n",
"\n",
"# synch\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"start = time.time()\n",
"\n",
"matmul_kernel(\n",
" grid_size,\n",
" block_size,\n",
" (A, B, C, N)\n",
")\n",
"\n",
"# synch\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"end = time.time()\n",
"\n",
"print(\"Custom kernel execution time:\", end - start, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bCHTIqpbYRpU",
"outputId": "f8cde82f-a075-4e91-917a-d5c0dfd7e486"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Custom kernel execution time: 0.0509185791015625 seconds\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# comparing different block sizes in a for loop\n",
"block_sizes = [(8,8), (16,16), (32,32)]\n",
"\n",
"for bs in block_sizes:\n",
" block = bs\n",
" grid = (\n",
" (N + block[0] - 1) // block[0],\n",
" (N + block[1] - 1) // block[1]\n",
" )\n",
"\n",
" cp.cuda.Stream.null.synchronize()\n",
" start = time.time()\n",
"\n",
" matmul_kernel(grid, block, (A, B, C, N))\n",
"\n",
" cp.cuda.Stream.null.synchronize()\n",
" end = time.time()\n",
"\n",
" print(f\"Block size {bs}: {end - start:.4f} seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Xd1qB8duYlbV",
"outputId": "18565577-0f09-464c-f325-3f7bc75b51cc"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Block size (8, 8): 0.0004 seconds\n",
"Block size (16, 16): 0.0003 seconds\n",
"Block size (32, 32): 0.0003 seconds\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"Here we can see that the custom C kernel took 0.0509185791015625 s while the built-in function took longer (0.1387937068939209 s). I half expected that because raw C code is fast, but according to Google the built-in function should actually be faster because it uses tiling. Note that both of these first calls include one-time overhead (raw-kernel JIT compilation / cuBLAS initialization) — the repeated runs above finish in ~0.0003 s, so the first-call timings overstate the steady-state cost."
],
"metadata": {
"id": "HwQbN7IsZDFk"
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "gX_xjXcsaxTd"
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "LiK1xHoPaxQr"
}
},
{
"cell_type": "markdown",
"source": [
"Before proceeding I should know what tiling is.\n",
"According to ChatGPT: \"The core idea (one sentence):\n",
"\n",
"Tiling = cut a big problem into small pieces that fit in fast memory, reuse them many times, then move to the next piece.\""
],
"metadata": {
"id": "G2gjndpiZvna"
}
},
{
"cell_type": "code",
"source": [
"# Question 5\n",
"\n",
"import cupy as cp\n",
"import time\n",
"\n",
"\n",
"N = 300\n",
"TILE = 16\n",
"\n",
"A = cp.random.rand(N, N).astype(cp.float32)\n",
"B = cp.random.rand(N, N).astype(cp.float32)\n",
"C = cp.zeros((N, N), dtype=cp.float32)\n",
"\n",
"# tiled kernel\n",
"kernel_code = rf'''\n",
"extern \"C\" __global__\n",
"void matmul_tiled(const float* A, const float* B, float* C, int N) {{\n",
"\n",
" __shared__ float As[{TILE}][{TILE}];\n",
" __shared__ float Bs[{TILE}][{TILE}];\n",
"\n",
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
" float sum = 0.0f;\n",
"\n",
" for (int t = 0; t < (N + {TILE} - 1) / {TILE}; t++) {{\n",
"\n",
" int A_col = t * {TILE} + threadIdx.x;\n",
" int B_row = t * {TILE} + threadIdx.y;\n",
"\n",
" // Load tiles into shared memory\n",
" if (row < N && A_col < N)\n",
" As[threadIdx.y][threadIdx.x] = A[row * N + A_col];\n",
" else\n",
" As[threadIdx.y][threadIdx.x] = 0.0f;\n",
"\n",
" if (B_row < N && col < N)\n",
" Bs[threadIdx.y][threadIdx.x] = B[B_row * N + col];\n",
" else\n",
" Bs[threadIdx.y][threadIdx.x] = 0.0f;\n",
"\n",
" __syncthreads();\n",
"\n",
" // Compute partial result\n",
" for (int k = 0; k < {TILE}; k++)\n",
" sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];\n",
"\n",
" __syncthreads();\n",
" }}\n",
"\n",
" if (row < N && col < N)\n",
" C[row * N + col] = sum;\n",
"}}\n",
"'''\n",
"\n",
"# Compile kernel\n",
"matmul_tiled = cp.RawKernel(kernel_code, 'matmul_tiled')\n"
],
"metadata": {
"id": "UrEyT6Iuce7X"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"block = (TILE, TILE)\n",
"grid = (\n",
" (N + TILE - 1) // TILE,\n",
" (N + TILE - 1) // TILE\n",
")\n",
"\n",
"cp.cuda.Stream.null.synchronize()\n",
"start = time.time()\n",
"\n",
"matmul_tiled(grid, block, (A, B, C, N))\n",
"\n",
"cp.cuda.Stream.null.synchronize()\n",
"end = time.time()\n",
"\n",
"print(\"Tiled kernel execution time:\", end - start, \"seconds\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aBrJs2Txcrrn",
"outputId": "f8b6b81b-58d3-49b2-dfbc-953de6a32874"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Tiled kernel execution time: 0.0007283687591552734 seconds\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"Better performance overall."
],
"metadata": {
"id": "rOQdAkTCc2M-"
}
}
]
}
\ No newline at end of file
To run the code, just open the .ipynb in Colab or Jupyter Notebook and execute each cell.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment