Commit ecfecf8c authored by mhdbashard's avatar mhdbashard

Update

parent de1bea2e
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b6d3aebb",
"metadata": {
"vscode": {
"languageId": "plaintext"
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "5TxHrnfxMYtK"
},
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bz0VYqONOBe1",
"outputId": "ef1b5673-0037-42a1-d1b6-44ca770bcec7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2023 NVIDIA Corporation\n",
"Built on Tue_Aug_15_22:02:13_PDT_2023\n",
"Cuda compilation tools, release 12.2, V12.2.140\n",
"Build cuda_12.2.r12.2/compiler.33191640_0\n"
]
}
],
"source": [
"!nvcc --version"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y5UasDOL-GN1",
"outputId": "e419a790-c080-4ff3-fcaf-2e7f9879f163"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add1.cu\n"
]
}
],
"source": [
"%%writefile add1.cu\n",
"#include <stdio.h>\n",
"#include <stdlib.h>\n",
"__global__ void add(int *a, int *b) {\n",
"a[0]+= b[0];\n",
"}\n",
"\n",
"int main() {\n",
"// host copies of variables a, b\n",
"int a, b;\n",
"\n",
"// device copies of variables a, b\n",
"int *d_a, *d_b;\n",
"\n",
"int size = sizeof(int);\n",
"\n",
"// Allocate space for device copies of a, b\n",
"cudaMalloc(&d_a, size);\n",
"cudaMalloc(&d_b, size);\n",
"\n",
"// Setup input values\n",
"a = 5;\n",
"b = 100;\n",
"\n",
"// Copy inputs to device\n",
"cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);\n",
"cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);\n",
"\n",
"// Launch add() kernel on GPU\n",
"add<<<1,1>>>(d_a, d_b);\n",
"\n",
"// Copy result back to host\n",
"cudaError err = cudaMemcpy(&a, d_a, size, cudaMemcpyDeviceToHost);\n",
" if(err!=cudaSuccess) {\n",
" printf(\"CUDA error copying to Host: %s\\n\", cudaGetErrorString(err));\n",
" }\n",
"printf(\"result is %d\\n\",a);\n",
"\n",
"// Cleanup\n",
"cudaFree(d_a);\n",
"cudaFree(d_b);\n",
"\n",
"return 0;\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i4mVKFx97KTx",
"outputId": "bc0cb70b-7f84-44ca-c5cd-7e9fc648065c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==4619== NVPROF is profiling process 4619, command: ./out\n",
"result is 105\n",
"==4619== Profiling application: ./out\n",
"==4619== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 46.26% 3.3600us 1 3.3600us 3.3600us 3.3600us add(int*, int*)\n",
" 28.63% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n",
" 25.11% 1.8240us 2 912ns 640ns 1.1840us [CUDA memcpy HtoD]\n",
" API calls: 53.54% 150.98ms 2 75.492ms 7.5110us 150.98ms cudaMalloc\n",
" 46.28% 130.52ms 1 130.52ms 130.52ms 130.52ms cudaLaunchKernel\n",
" 0.08% 215.40us 2 107.70us 34.649us 180.76us cudaFree\n",
" 0.07% 194.04us 114 1.7020us 258ns 75.755us cuDeviceGetAttribute\n",
" 0.03% 72.888us 3 24.296us 6.7490us 38.437us cudaMemcpy\n",
" 0.00% 11.698us 1 11.698us 11.698us 11.698us cuDeviceGetName\n",
" 0.00% 8.3590us 1 8.3590us 8.3590us 8.3590us cuDeviceGetPCIBusId\n",
" 0.00% 5.0250us 1 5.0250us 5.0250us 5.0250us cuDeviceTotalMem\n",
" 0.00% 1.9900us 3 663ns 339ns 1.3000us cuDeviceGetCount\n",
" 0.00% 1.0380us 2 519ns 315ns 723ns cuDeviceGet\n",
" 0.00% 642ns 1 642ns 642ns 642ns cuModuleGetLoadingMode\n",
" 0.00% 357ns 1 357ns 357ns 357ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add1.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "xQMHTBWR7x4o",
"outputId": "71790427-a2b9-4f49-dde0-ed1bb2043b53"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add2.cu\n"
]
}
],
"source": [
"%%writefile add2.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" for(int i = 0; i < n; i += 1){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i;\n",
" b[i] = 5;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,256>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
"\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "14oI7-QW8EPy",
"outputId": "f2648648-a7a3-49ca-9750-70ba060441cd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==596== NVPROF is profiling process 596, command: ./out\n",
"==596== Profiling application: ./out\n",
"==596== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 98.52% 1.0440ms 1 1.0440ms 1.0440ms 1.0440ms vector_add(float*, float*, float*, int)\n",
" 1.09% 11.550us 2 5.7750us 5.7590us 5.7910us [CUDA memcpy HtoD]\n",
" 0.39% 4.0960us 1 4.0960us 4.0960us 4.0960us [CUDA memcpy DtoH]\n",
" API calls: 99.00% 176.44ms 3 58.814ms 6.7960us 176.43ms cudaMalloc\n",
" 0.65% 1.1567ms 3 385.58us 26.484us 1.0936ms cudaMemcpy\n",
" 0.20% 353.24us 1 353.24us 353.24us 353.24us cuDeviceTotalMem\n",
" 0.07% 131.50us 96 1.3690us 127ns 47.376us cuDeviceGetAttribute\n",
" 0.05% 94.987us 3 31.662us 4.9660us 79.056us cudaFree\n",
" 0.02% 27.053us 1 27.053us 27.053us 27.053us cudaLaunchKernel\n",
" 0.01% 15.460us 1 15.460us 15.460us 15.460us cuDeviceGetName\n",
" 0.00% 2.6340us 1 2.6340us 2.6340us 2.6340us cuDeviceGetPCIBusId\n",
" 0.00% 1.7800us 3 593ns 186ns 1.0060us cuDeviceGetCount\n",
" 0.00% 827ns 2 413ns 248ns 579ns cuDeviceGet\n",
" 0.00% 255ns 1 255ns 255ns 255ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add2.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "bgHifWBCOmms",
"outputId": "53a3e591-7a07-4e2f-ad7e-265ad9c63a39"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add3.cu\n"
]
}
],
"source": [
"%%writefile add3.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"#define MAX_ER 1e-6\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" int index = threadIdx.x;\n",
" int stride = blockDim.x;\n",
"\n",
" for(int i = index; i < n; i += stride){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,512>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "hqZBkrd8x0k4",
"outputId": "48dfa7c6-35ed-43c0-b029-c30e810be629"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linux\n"
]
}
],
"source": [
"! uname"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "tj8bqWhN6_ei",
"outputId": "bd822a17-d13a-486f-d111-ea3d300dfdbb"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==186== NVPROF is profiling process 186, command: ./out\n",
"==186== Profiling application: ./out\n",
"==186== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 50.83% 14.591us 2 7.2950us 7.2950us 7.2960us [CUDA memcpy HtoD]\n",
" 34.45% 9.8880us 1 9.8880us 9.8880us 9.8880us vector_add(float*, float*, float*, int)\n",
" 14.72% 4.2240us 1 4.2240us 4.2240us 4.2240us [CUDA memcpy DtoH]\n",
" API calls: 99.74% 261.61ms 3 87.202ms 5.2700us 261.59ms cudaMalloc\n",
" 0.09% 239.89us 96 2.4980us 127ns 159.76us cuDeviceGetAttribute\n",
" 0.07% 184.03us 1 184.03us 184.03us 184.03us cuDeviceTotalMem\n",
" 0.04% 104.40us 3 34.800us 28.086us 48.080us cudaMemcpy\n",
" 0.03% 87.576us 3 29.192us 5.0130us 72.747us cudaFree\n",
" 0.01% 27.161us 1 27.161us 27.161us 27.161us cudaLaunchKernel\n",
" 0.01% 21.881us 1 21.881us 21.881us 21.881us cuDeviceGetName\n",
" 0.00% 3.0560us 1 3.0560us 3.0560us 3.0560us cuDeviceGetPCIBusId\n",
" 0.00% 1.6590us 3 553ns 140ns 1.2640us cuDeviceGetCount\n",
" 0.00% 1.2970us 2 648ns 241ns 1.0560us cuDeviceGet\n",
" 0.00% 251ns 1 251ns 251ns 251ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add3.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7Azzx2U3_DPG",
"outputId": "db632a58-afc9-4c5e-d53a-8639e6b9b287"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add4.cu\n"
]
}
],
"source": [
"%%writefile add4.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
"    int tid = blockIdx.x * blockDim.x + threadIdx.x;\n",
"    // guard: grid_size * block_size may exceed n, so skip out-of-range threads\n",
"    if (tid < n) out[tid] = a[tid] + b[tid];\n",
"\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" int block_size = 32;\n",
" int grid_size = ((N + block_size) / block_size);\n",
" vector_add<<<grid_size,block_size>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6rUKHYj-_hle",
"outputId": "9615f4ee-05a7-4a1c-da7c-2431745fd9b9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==2018== NVPROF is profiling process 2018, command: ./out\n",
"==2018== Profiling application: ./out\n",
"==2018== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 54.56% 10.720us 2 5.3600us 5.2160us 5.5040us [CUDA memcpy HtoD]\n",
" 25.08% 4.9280us 1 4.9280us 4.9280us 4.9280us [CUDA memcpy DtoH]\n",
" 20.36% 4.0000us 1 4.0000us 4.0000us 4.0000us vector_add(float*, float*, float*, int)\n",
" API calls: 99.68% 203.91ms 3 67.971ms 5.1070us 203.90ms cudaMalloc\n",
" 0.11% 214.95us 1 214.95us 214.95us 214.95us cudaLaunchKernel\n",
" 0.07% 137.26us 3 45.753us 5.9300us 121.30us cudaFree\n",
" 0.07% 134.98us 114 1.1840us 144ns 53.102us cuDeviceGetAttribute\n",
" 0.06% 132.93us 3 44.310us 28.425us 62.880us cudaMemcpy\n",
" 0.01% 12.321us 1 12.321us 12.321us 12.321us cuDeviceGetName\n",
" 0.00% 5.5190us 1 5.5190us 5.5190us 5.5190us cuDeviceTotalMem\n",
" 0.00% 5.2280us 1 5.2280us 5.2280us 5.2280us cuDeviceGetPCIBusId\n",
" 0.00% 1.8590us 3 619ns 237ns 1.3680us cuDeviceGetCount\n",
" 0.00% 1.1470us 2 573ns 173ns 974ns cuDeviceGet\n",
" 0.00% 708ns 1 708ns 708ns 708ns cuModuleGetLoadingMode\n",
" 0.00% 228ns 1 228ns 228ns 228ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add4.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fvbifMBvgpMX"
},
"outputs": [],
"source": []
}
},
"outputs": [],
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8eff613",
"metadata": {
"vscode": {
"languageId": "plaintext"
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"outputs": [],
"source": [
"import cupy\n",
"\n",
"# size of the vectors\n",
"size = 1024\n",
"\n",
"# allocating and populating the vectors\n",
"a_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"b_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"c_gpu = cupy.zeros(size, dtype=cupy.float32)\n",
"\n",
"# CUDA vector_add\n",
"vector_add_cuda_code = r'''\n",
"extern \"C\"\n",
"__global__ void vector_add(const float * A, const float * B, float * C, const int size)\n",
"{\n",
" int item = threadIdx.x;\n",
" C[item] = A[item] + B[item];\n",
"}\n",
"'''\n",
"vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, \"vector_add\")\n",
"\n",
"vector_add_gpu((1, 1, 1), (size, 1, 1), (a_gpu, b_gpu, c_gpu, size))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "5TxHrnfxMYtK"
},
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bz0VYqONOBe1",
"outputId": "ef1b5673-0037-42a1-d1b6-44ca770bcec7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2023 NVIDIA Corporation\n",
"Built on Tue_Aug_15_22:02:13_PDT_2023\n",
"Cuda compilation tools, release 12.2, V12.2.140\n",
"Build cuda_12.2.r12.2/compiler.33191640_0\n"
]
}
],
"source": [
"!nvcc --version"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y5UasDOL-GN1",
"outputId": "e419a790-c080-4ff3-fcaf-2e7f9879f163"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add1.cu\n"
]
}
],
"source": [
"%%writefile add1.cu\n",
"#include <stdio.h>\n",
"#include <stdlib.h>\n",
"__global__ void add(int *a, int *b) {\n",
"a[0]+= b[0];\n",
"}\n",
"\n",
"int main() {\n",
"// host copies of variables a, b\n",
"int a, b;\n",
"\n",
"// device copies of variables a, b\n",
"int *d_a, *d_b;\n",
"\n",
"int size = sizeof(int);\n",
"\n",
"// Allocate space for device copies of a, b\n",
"cudaMalloc(&d_a, size);\n",
"cudaMalloc(&d_b, size);\n",
"\n",
"// Setup input values\n",
"a = 5;\n",
"b = 100;\n",
"\n",
"// Copy inputs to device\n",
"cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);\n",
"cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);\n",
"\n",
"// Launch add() kernel on GPU\n",
"add<<<1,1>>>(d_a, d_b);\n",
"\n",
"// Copy result back to host\n",
"cudaError err = cudaMemcpy(&a, d_a, size, cudaMemcpyDeviceToHost);\n",
" if(err!=cudaSuccess) {\n",
" printf(\"CUDA error copying to Host: %s\\n\", cudaGetErrorString(err));\n",
" }\n",
"printf(\"result is %d\\n\",a);\n",
"\n",
"// Cleanup\n",
"cudaFree(d_a);\n",
"cudaFree(d_b);\n",
"\n",
"return 0;\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i4mVKFx97KTx",
"outputId": "bc0cb70b-7f84-44ca-c5cd-7e9fc648065c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==4619== NVPROF is profiling process 4619, command: ./out\n",
"result is 105\n",
"==4619== Profiling application: ./out\n",
"==4619== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 46.26% 3.3600us 1 3.3600us 3.3600us 3.3600us add(int*, int*)\n",
" 28.63% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n",
" 25.11% 1.8240us 2 912ns 640ns 1.1840us [CUDA memcpy HtoD]\n",
" API calls: 53.54% 150.98ms 2 75.492ms 7.5110us 150.98ms cudaMalloc\n",
" 46.28% 130.52ms 1 130.52ms 130.52ms 130.52ms cudaLaunchKernel\n",
" 0.08% 215.40us 2 107.70us 34.649us 180.76us cudaFree\n",
" 0.07% 194.04us 114 1.7020us 258ns 75.755us cuDeviceGetAttribute\n",
" 0.03% 72.888us 3 24.296us 6.7490us 38.437us cudaMemcpy\n",
" 0.00% 11.698us 1 11.698us 11.698us 11.698us cuDeviceGetName\n",
" 0.00% 8.3590us 1 8.3590us 8.3590us 8.3590us cuDeviceGetPCIBusId\n",
" 0.00% 5.0250us 1 5.0250us 5.0250us 5.0250us cuDeviceTotalMem\n",
" 0.00% 1.9900us 3 663ns 339ns 1.3000us cuDeviceGetCount\n",
" 0.00% 1.0380us 2 519ns 315ns 723ns cuDeviceGet\n",
" 0.00% 642ns 1 642ns 642ns 642ns cuModuleGetLoadingMode\n",
" 0.00% 357ns 1 357ns 357ns 357ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add1.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "xQMHTBWR7x4o",
"outputId": "71790427-a2b9-4f49-dde0-ed1bb2043b53"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add2.cu\n"
]
}
],
"source": [
"%%writefile add2.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" for(int i = 0; i < n; i += 1){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i;\n",
" b[i] = 5;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,256>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
"\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "14oI7-QW8EPy",
"outputId": "f2648648-a7a3-49ca-9750-70ba060441cd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==596== NVPROF is profiling process 596, command: ./out\n",
"==596== Profiling application: ./out\n",
"==596== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 98.52% 1.0440ms 1 1.0440ms 1.0440ms 1.0440ms vector_add(float*, float*, float*, int)\n",
" 1.09% 11.550us 2 5.7750us 5.7590us 5.7910us [CUDA memcpy HtoD]\n",
" 0.39% 4.0960us 1 4.0960us 4.0960us 4.0960us [CUDA memcpy DtoH]\n",
" API calls: 99.00% 176.44ms 3 58.814ms 6.7960us 176.43ms cudaMalloc\n",
" 0.65% 1.1567ms 3 385.58us 26.484us 1.0936ms cudaMemcpy\n",
" 0.20% 353.24us 1 353.24us 353.24us 353.24us cuDeviceTotalMem\n",
" 0.07% 131.50us 96 1.3690us 127ns 47.376us cuDeviceGetAttribute\n",
" 0.05% 94.987us 3 31.662us 4.9660us 79.056us cudaFree\n",
" 0.02% 27.053us 1 27.053us 27.053us 27.053us cudaLaunchKernel\n",
" 0.01% 15.460us 1 15.460us 15.460us 15.460us cuDeviceGetName\n",
" 0.00% 2.6340us 1 2.6340us 2.6340us 2.6340us cuDeviceGetPCIBusId\n",
" 0.00% 1.7800us 3 593ns 186ns 1.0060us cuDeviceGetCount\n",
" 0.00% 827ns 2 413ns 248ns 579ns cuDeviceGet\n",
" 0.00% 255ns 1 255ns 255ns 255ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add2.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "bgHifWBCOmms",
"outputId": "53a3e591-7a07-4e2f-ad7e-265ad9c63a39"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add3.cu\n"
]
}
],
"source": [
"%%writefile add3.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"#define MAX_ER 1e-6\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" int index = threadIdx.x;\n",
" int stride = blockDim.x;\n",
"\n",
" for(int i = index; i < n; i += stride){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,512>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "hqZBkrd8x0k4",
"outputId": "48dfa7c6-35ed-43c0-b029-c30e810be629"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linux\n"
]
}
],
"source": [
"! uname"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "tj8bqWhN6_ei",
"outputId": "bd822a17-d13a-486f-d111-ea3d300dfdbb"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==186== NVPROF is profiling process 186, command: ./out\n",
"==186== Profiling application: ./out\n",
"==186== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 50.83% 14.591us 2 7.2950us 7.2950us 7.2960us [CUDA memcpy HtoD]\n",
" 34.45% 9.8880us 1 9.8880us 9.8880us 9.8880us vector_add(float*, float*, float*, int)\n",
" 14.72% 4.2240us 1 4.2240us 4.2240us 4.2240us [CUDA memcpy DtoH]\n",
" API calls: 99.74% 261.61ms 3 87.202ms 5.2700us 261.59ms cudaMalloc\n",
" 0.09% 239.89us 96 2.4980us 127ns 159.76us cuDeviceGetAttribute\n",
" 0.07% 184.03us 1 184.03us 184.03us 184.03us cuDeviceTotalMem\n",
" 0.04% 104.40us 3 34.800us 28.086us 48.080us cudaMemcpy\n",
" 0.03% 87.576us 3 29.192us 5.0130us 72.747us cudaFree\n",
" 0.01% 27.161us 1 27.161us 27.161us 27.161us cudaLaunchKernel\n",
" 0.01% 21.881us 1 21.881us 21.881us 21.881us cuDeviceGetName\n",
" 0.00% 3.0560us 1 3.0560us 3.0560us 3.0560us cuDeviceGetPCIBusId\n",
" 0.00% 1.6590us 3 553ns 140ns 1.2640us cuDeviceGetCount\n",
" 0.00% 1.2970us 2 648ns 241ns 1.0560us cuDeviceGet\n",
" 0.00% 251ns 1 251ns 251ns 251ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add3.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7Azzx2U3_DPG",
"outputId": "db632a58-afc9-4c5e-d53a-8639e6b9b287"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add4.cu\n"
]
}
],
"source": [
"%%writefile add4.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
"    int tid = blockIdx.x * blockDim.x + threadIdx.x;\n",
"    // guard: grid_size * block_size may exceed n, so skip out-of-range threads\n",
"    if (tid < n) out[tid] = a[tid] + b[tid];\n",
"\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" int block_size = 32;\n",
" int grid_size = ((N + block_size) / block_size);\n",
" vector_add<<<grid_size,block_size>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6rUKHYj-_hle",
"outputId": "9615f4ee-05a7-4a1c-da7c-2431745fd9b9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==2018== NVPROF is profiling process 2018, command: ./out\n",
"==2018== Profiling application: ./out\n",
"==2018== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 54.56% 10.720us 2 5.3600us 5.2160us 5.5040us [CUDA memcpy HtoD]\n",
" 25.08% 4.9280us 1 4.9280us 4.9280us 4.9280us [CUDA memcpy DtoH]\n",
" 20.36% 4.0000us 1 4.0000us 4.0000us 4.0000us vector_add(float*, float*, float*, int)\n",
" API calls: 99.68% 203.91ms 3 67.971ms 5.1070us 203.90ms cudaMalloc\n",
" 0.11% 214.95us 1 214.95us 214.95us 214.95us cudaLaunchKernel\n",
" 0.07% 137.26us 3 45.753us 5.9300us 121.30us cudaFree\n",
" 0.07% 134.98us 114 1.1840us 144ns 53.102us cuDeviceGetAttribute\n",
" 0.06% 132.93us 3 44.310us 28.425us 62.880us cudaMemcpy\n",
" 0.01% 12.321us 1 12.321us 12.321us 12.321us cuDeviceGetName\n",
" 0.00% 5.5190us 1 5.5190us 5.5190us 5.5190us cuDeviceTotalMem\n",
" 0.00% 5.2280us 1 5.2280us 5.2280us 5.2280us cuDeviceGetPCIBusId\n",
" 0.00% 1.8590us 3 619ns 237ns 1.3680us cuDeviceGetCount\n",
" 0.00% 1.1470us 2 573ns 173ns 974ns cuDeviceGet\n",
" 0.00% 708ns 1 708ns 708ns 708ns cuModuleGetLoadingMode\n",
" 0.00% 228ns 1 228ns 228ns 228ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add4.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fvbifMBvgpMX"
},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b6d3aebb",
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8eff613",
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"import cupy\n",
"\n",
"# size of the vectors\n",
"size = 1024\n",
"\n",
"# allocating and populating the vectors\n",
"a_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"b_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"c_gpu = cupy.zeros(size, dtype=cupy.float32)\n",
"\n",
"# CUDA vector_add\n",
"vector_add_cuda_code = r'''\n",
"extern \"C\"\n",
"__global__ void vector_add(const float * A, const float * B, float * C, const int size)\n",
"{\n",
" int item = threadIdx.x;\n",
" C[item] = A[item] + B[item];\n",
"}\n",
"'''\n",
"vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, \"vector_add\")\n",
"\n",
"vector_add_gpu((1, 1, 1), (size, 1, 1), (a_gpu, b_gpu, c_gpu, size))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
# GPU Programming
A collection of notebooks for learning and practicing GPU programming with CUDA.
## Contents
### Notebooks
- **GPU-Introduction.ipynb** - Introduction to GPU computing concepts and fundamentals
- **GPU-Check ENV.ipynb** - Verify and check GPU environment setup and CUDA configuration
- **GPU-HW CUDA Programming.ipynb** - Homework CUDA programming exercises
- **GPU-Lab C Kernel.ipynb** - CUDA kernel programming with C language
- **GPU-Lab Py Kernel.ipynb** - CUDA kernel programming using CuPy in Python
## Overview
This workspace contains practical exercises and examples for GPU programming, including:
- Vector operations on GPU
- CUDA kernel implementation and execution
- GPU memory management
- Performance optimization techniques
## Requirements
- CUDA Toolkit installed
- CuPy (for Python CUDA programming)
- Jupyter notebook environment
## Getting Started
Start with `GPU-Introduction.ipynb` for foundational concepts, then progress through the other notebooks to practice implementing CUDA kernels and GPU operations.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment