"# Warm-up: run the multiplication once untimed, because a first cp.matmul call in a session may include one-time library initialization\n",
"cp.matmul(A_gpu, B_gpu)\n",
"\n",
"# cp.cuda.Stream.null.synchronize() blocks the CPU (host) until all previously scheduled operations in the default CUDA stream on the GPU have completed,\n",
"# so the warm-up work is finished before the clock starts\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"# start timing; time.perf_counter() is the clock intended for measuring short intervals\n",
"start_time = time.perf_counter()\n",
"\n",
"# using the built-in matmul function\n",
"C_gpu = cp.matmul(A_gpu, B_gpu)\n",
"\n",
"# GPU work is launched asynchronously, so wait for it to finish before reading the clock\n",
"cp.cuda.Stream.null.synchronize()\n",
"\n",
"# stop timing\n",
"end_time = time.perf_counter()\n",
"\n",
"# elapsed wall-clock time in seconds\n",
"execution_time = end_time - start_time\n",
"\n",
"print(\"Matrix size:\", N, \"x\", N)\n",
"print(\"Execution time (GPU using CuPy):\", execution_time, \"seconds\")\n"
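"Here we can see that the custom C kernel took 0.0509185791015625 seconds, while the built-in function took longer: 0.1387937068939209 seconds. I kind of expected that, because C code is faster, but according to Google the built-in function should be faster because it uses tiling. Note that a single timed run can include one-time start-up overhead, so this comparison may not reflect steady-state performance."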
],
"metadata": {
"id": "HwQbN7IsZDFk"
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "gX_xjXcsaxTd"
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "LiK1xHoPaxQr"
}
},
{
"cell_type": "markdown",
"source": [
"Before proceeding, I should understand what tiling is.\n",
"\n",
"According to ChatGPT, the core idea in one sentence is:\n",
"\n",
"\"Tiling = cut a big problem into small pieces that fit in fast memory, reuse them many times, then move to the next piece.\""
],
"metadata": {
"id": "G2gjndpiZvna"
}
},
{
"cell_type": "code",
"source": [
"# Question 5: tiled matrix multiplication with a custom CUDA kernel\n",
"\n",
"import time\n",
"\n",
"import cupy as cp\n",
"\n",
"\n",
"# Matrices are N x N; 300 is not a multiple of TILE, so the kernel's bounds checks are exercised\n",
"N = 300\n",
"# Tile edge length; substituted into the kernel source below via {TILE}\n",
"TILE = 16\n",
"\n",
"# Fixed seed so the random inputs are identical on every run\n",
"cp.random.seed(0)\n",
"\n",
"# float32 input matrices and a zero-initialised float32 output matrix\n",
"A = cp.random.rand(N, N).astype(cp.float32)\n",
"B = cp.random.rand(N, N).astype(cp.float32)\n",
"C = cp.zeros((N, N), dtype=cp.float32)\n",
"\n",
"# tiled kernel\n",
"kernel_code = rf'''\n",
"extern \"C\" __global__\n",
"void matmul_tiled(const float* A, const float* B, float* C, int N) {{\n",
"\n",
" __shared__ float As[{TILE}][{TILE}];\n",
" __shared__ float Bs[{TILE}][{TILE}];\n",
"\n",
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
" float sum = 0.0f;\n",
"\n",
" for (int t = 0; t < (N + {TILE} - 1) / {TILE}; t++) {{\n",
"\n",
" int A_col = t * {TILE} + threadIdx.x;\n",
" int B_row = t * {TILE} + threadIdx.y;\n",
"\n",
" // Load tiles into shared memory\n",
" if (row < N && A_col < N)\n",
" As[threadIdx.y][threadIdx.x] = A[row * N + A_col];\n",
" else\n",
" As[threadIdx.y][threadIdx.x] = 0.0f;\n",
"\n",
" if (B_row < N && col < N)\n",
" Bs[threadIdx.y][threadIdx.x] = B[B_row * N + col];\n",
" else\n",
" Bs[threadIdx.y][threadIdx.x] = 0.0f;\n",
"\n",
" __syncthreads();\n",
"\n",
" // Compute partial result\n",
" for (int k = 0; k < {TILE}; k++)\n",
" sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];\n",