Commit ecfecf8c authored by mhdbashard's avatar mhdbashard

Update

parent de1bea2e
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b6d3aebb",
"metadata": {
"vscode": {
"languageId": "plaintext"
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "5TxHrnfxMYtK"
},
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bz0VYqONOBe1",
"outputId": "ef1b5673-0037-42a1-d1b6-44ca770bcec7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2023 NVIDIA Corporation\n",
"Built on Tue_Aug_15_22:02:13_PDT_2023\n",
"Cuda compilation tools, release 12.2, V12.2.140\n",
"Build cuda_12.2.r12.2/compiler.33191640_0\n"
]
}
],
"source": [
"!nvcc --version"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y5UasDOL-GN1",
"outputId": "e419a790-c080-4ff3-fcaf-2e7f9879f163"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add1.cu\n"
]
}
],
"source": [
"%%writefile add1.cu\n",
"#include <stdio.h>\n",
"#include <stdlib.h>\n",
"__global__ void add(int *a, int *b) {\n",
"a[0]+= b[0];\n",
"}\n",
"\n",
"int main() {\n",
"// host copies of variables a, b\n",
"int a, b;\n",
"\n",
"// device copies of variables a, b\n",
"int *d_a, *d_b;\n",
"\n",
"int size = sizeof(int);\n",
"\n",
"// Allocate space for device copies of a, b\n",
"cudaMalloc(&d_a, size);\n",
"cudaMalloc(&d_b, size);\n",
"\n",
"// Setup input values\n",
"a = 5;\n",
"b = 100;\n",
"\n",
"// Copy inputs to device\n",
"cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);\n",
"cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);\n",
"\n",
"// Launch add() kernel on GPU\n",
"add<<<1,1>>>(d_a, d_b);\n",
"\n",
"// Copy result back to host\n",
"cudaError err = cudaMemcpy(&a, d_a, size, cudaMemcpyDeviceToHost);\n",
" if(err!=cudaSuccess) {\n",
" printf(\"CUDA error copying to Host: %s\\n\", cudaGetErrorString(err));\n",
" }\n",
"printf(\"result is %d\\n\",a);\n",
"\n",
"// Cleanup\n",
"cudaFree(d_a);\n",
"cudaFree(d_b);\n",
"\n",
"return 0;\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i4mVKFx97KTx",
"outputId": "bc0cb70b-7f84-44ca-c5cd-7e9fc648065c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==4619== NVPROF is profiling process 4619, command: ./out\n",
"result is 105\n",
"==4619== Profiling application: ./out\n",
"==4619== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 46.26% 3.3600us 1 3.3600us 3.3600us 3.3600us add(int*, int*)\n",
" 28.63% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n",
" 25.11% 1.8240us 2 912ns 640ns 1.1840us [CUDA memcpy HtoD]\n",
" API calls: 53.54% 150.98ms 2 75.492ms 7.5110us 150.98ms cudaMalloc\n",
" 46.28% 130.52ms 1 130.52ms 130.52ms 130.52ms cudaLaunchKernel\n",
" 0.08% 215.40us 2 107.70us 34.649us 180.76us cudaFree\n",
" 0.07% 194.04us 114 1.7020us 258ns 75.755us cuDeviceGetAttribute\n",
" 0.03% 72.888us 3 24.296us 6.7490us 38.437us cudaMemcpy\n",
" 0.00% 11.698us 1 11.698us 11.698us 11.698us cuDeviceGetName\n",
" 0.00% 8.3590us 1 8.3590us 8.3590us 8.3590us cuDeviceGetPCIBusId\n",
" 0.00% 5.0250us 1 5.0250us 5.0250us 5.0250us cuDeviceTotalMem\n",
" 0.00% 1.9900us 3 663ns 339ns 1.3000us cuDeviceGetCount\n",
" 0.00% 1.0380us 2 519ns 315ns 723ns cuDeviceGet\n",
" 0.00% 642ns 1 642ns 642ns 642ns cuModuleGetLoadingMode\n",
" 0.00% 357ns 1 357ns 357ns 357ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add1.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "xQMHTBWR7x4o",
"outputId": "71790427-a2b9-4f49-dde0-ed1bb2043b53"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add2.cu\n"
]
}
],
"source": [
"%%writefile add2.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" for(int i = 0; i < n; i += 1){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i;\n",
" b[i] = 5;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,256>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
"\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "14oI7-QW8EPy",
"outputId": "f2648648-a7a3-49ca-9750-70ba060441cd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==596== NVPROF is profiling process 596, command: ./out\n",
"==596== Profiling application: ./out\n",
"==596== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 98.52% 1.0440ms 1 1.0440ms 1.0440ms 1.0440ms vector_add(float*, float*, float*, int)\n",
" 1.09% 11.550us 2 5.7750us 5.7590us 5.7910us [CUDA memcpy HtoD]\n",
" 0.39% 4.0960us 1 4.0960us 4.0960us 4.0960us [CUDA memcpy DtoH]\n",
" API calls: 99.00% 176.44ms 3 58.814ms 6.7960us 176.43ms cudaMalloc\n",
" 0.65% 1.1567ms 3 385.58us 26.484us 1.0936ms cudaMemcpy\n",
" 0.20% 353.24us 1 353.24us 353.24us 353.24us cuDeviceTotalMem\n",
" 0.07% 131.50us 96 1.3690us 127ns 47.376us cuDeviceGetAttribute\n",
" 0.05% 94.987us 3 31.662us 4.9660us 79.056us cudaFree\n",
" 0.02% 27.053us 1 27.053us 27.053us 27.053us cudaLaunchKernel\n",
" 0.01% 15.460us 1 15.460us 15.460us 15.460us cuDeviceGetName\n",
" 0.00% 2.6340us 1 2.6340us 2.6340us 2.6340us cuDeviceGetPCIBusId\n",
" 0.00% 1.7800us 3 593ns 186ns 1.0060us cuDeviceGetCount\n",
" 0.00% 827ns 2 413ns 248ns 579ns cuDeviceGet\n",
" 0.00% 255ns 1 255ns 255ns 255ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add2.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "bgHifWBCOmms",
"outputId": "53a3e591-7a07-4e2f-ad7e-265ad9c63a39"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add3.cu\n"
]
}
],
"source": [
"%%writefile add3.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"#define MAX_ER 1e-6\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" int index = threadIdx.x;\n",
" int stride = blockDim.x;\n",
"\n",
" for(int i = index; i < n; i += stride){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,512>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "hqZBkrd8x0k4",
"outputId": "48dfa7c6-35ed-43c0-b029-c30e810be629"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linux\n"
]
}
],
"source": [
"! uname"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "tj8bqWhN6_ei",
"outputId": "bd822a17-d13a-486f-d111-ea3d300dfdbb"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==186== NVPROF is profiling process 186, command: ./out\n",
"==186== Profiling application: ./out\n",
"==186== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 50.83% 14.591us 2 7.2950us 7.2950us 7.2960us [CUDA memcpy HtoD]\n",
" 34.45% 9.8880us 1 9.8880us 9.8880us 9.8880us vector_add(float*, float*, float*, int)\n",
" 14.72% 4.2240us 1 4.2240us 4.2240us 4.2240us [CUDA memcpy DtoH]\n",
" API calls: 99.74% 261.61ms 3 87.202ms 5.2700us 261.59ms cudaMalloc\n",
" 0.09% 239.89us 96 2.4980us 127ns 159.76us cuDeviceGetAttribute\n",
" 0.07% 184.03us 1 184.03us 184.03us 184.03us cuDeviceTotalMem\n",
" 0.04% 104.40us 3 34.800us 28.086us 48.080us cudaMemcpy\n",
" 0.03% 87.576us 3 29.192us 5.0130us 72.747us cudaFree\n",
" 0.01% 27.161us 1 27.161us 27.161us 27.161us cudaLaunchKernel\n",
" 0.01% 21.881us 1 21.881us 21.881us 21.881us cuDeviceGetName\n",
" 0.00% 3.0560us 1 3.0560us 3.0560us 3.0560us cuDeviceGetPCIBusId\n",
" 0.00% 1.6590us 3 553ns 140ns 1.2640us cuDeviceGetCount\n",
" 0.00% 1.2970us 2 648ns 241ns 1.0560us cuDeviceGet\n",
" 0.00% 251ns 1 251ns 251ns 251ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add3.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7Azzx2U3_DPG",
"outputId": "db632a58-afc9-4c5e-d53a-8639e6b9b287"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add4.cu\n"
]
}
],
"source": [
"%%writefile add4.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
"    int tid = blockIdx.x * blockDim.x + threadIdx.x;\n",
"    // guard: grid_size * block_size may exceed n, so skip out-of-range threads\n",
"    if (tid < n) out[tid] = a[tid] + b[tid];\n",
"\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" int block_size = 32;\n",
" int grid_size = ((N + block_size) / block_size);\n",
" vector_add<<<grid_size,block_size>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6rUKHYj-_hle",
"outputId": "9615f4ee-05a7-4a1c-da7c-2431745fd9b9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==2018== NVPROF is profiling process 2018, command: ./out\n",
"==2018== Profiling application: ./out\n",
"==2018== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 54.56% 10.720us 2 5.3600us 5.2160us 5.5040us [CUDA memcpy HtoD]\n",
" 25.08% 4.9280us 1 4.9280us 4.9280us 4.9280us [CUDA memcpy DtoH]\n",
" 20.36% 4.0000us 1 4.0000us 4.0000us 4.0000us vector_add(float*, float*, float*, int)\n",
" API calls: 99.68% 203.91ms 3 67.971ms 5.1070us 203.90ms cudaMalloc\n",
" 0.11% 214.95us 1 214.95us 214.95us 214.95us cudaLaunchKernel\n",
" 0.07% 137.26us 3 45.753us 5.9300us 121.30us cudaFree\n",
" 0.07% 134.98us 114 1.1840us 144ns 53.102us cuDeviceGetAttribute\n",
" 0.06% 132.93us 3 44.310us 28.425us 62.880us cudaMemcpy\n",
" 0.01% 12.321us 1 12.321us 12.321us 12.321us cuDeviceGetName\n",
" 0.00% 5.5190us 1 5.5190us 5.5190us 5.5190us cuDeviceTotalMem\n",
" 0.00% 5.2280us 1 5.2280us 5.2280us 5.2280us cuDeviceGetPCIBusId\n",
" 0.00% 1.8590us 3 619ns 237ns 1.3680us cuDeviceGetCount\n",
" 0.00% 1.1470us 2 573ns 173ns 974ns cuDeviceGet\n",
" 0.00% 708ns 1 708ns 708ns 708ns cuModuleGetLoadingMode\n",
" 0.00% 228ns 1 228ns 228ns 228ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add4.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fvbifMBvgpMX"
},
"outputs": [],
"source": []
}
},
"outputs": [],
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8eff613",
"metadata": {
"vscode": {
"languageId": "plaintext"
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"outputs": [],
"source": [
"import cupy\n",
"\n",
"# size of the vectors\n",
"size = 1024\n",
"\n",
"# allocating and populating the vectors\n",
"a_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"b_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"c_gpu = cupy.zeros(size, dtype=cupy.float32)\n",
"\n",
"# CUDA vector_add\n",
"vector_add_cuda_code = r'''\n",
"extern \"C\"\n",
"__global__ void vector_add(const float * A, const float * B, float * C, const int size)\n",
"{\n",
" int item = threadIdx.x;\n",
" C[item] = A[item] + B[item];\n",
"}\n",
"'''\n",
"vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, \"vector_add\")\n",
"\n",
"vector_add_gpu((1, 1, 1), (size, 1, 1), (a_gpu, b_gpu, c_gpu, size))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "5TxHrnfxMYtK"
},
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bz0VYqONOBe1",
"outputId": "ef1b5673-0037-42a1-d1b6-44ca770bcec7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2023 NVIDIA Corporation\n",
"Built on Tue_Aug_15_22:02:13_PDT_2023\n",
"Cuda compilation tools, release 12.2, V12.2.140\n",
"Build cuda_12.2.r12.2/compiler.33191640_0\n"
]
}
],
"source": [
"!nvcc --version"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y5UasDOL-GN1",
"outputId": "e419a790-c080-4ff3-fcaf-2e7f9879f163"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add1.cu\n"
]
}
],
"source": [
"%%writefile add1.cu\n",
"#include <stdio.h>\n",
"#include <stdlib.h>\n",
"__global__ void add(int *a, int *b) {\n",
"a[0]+= b[0];\n",
"}\n",
"\n",
"int main() {\n",
"// host copies of variables a, b\n",
"int a, b;\n",
"\n",
"// device copies of variables a, b\n",
"int *d_a, *d_b;\n",
"\n",
"int size = sizeof(int);\n",
"\n",
"// Allocate space for device copies of a, b\n",
"cudaMalloc(&d_a, size);\n",
"cudaMalloc(&d_b, size);\n",
"\n",
"// Setup input values\n",
"a = 5;\n",
"b = 100;\n",
"\n",
"// Copy inputs to device\n",
"cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);\n",
"cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);\n",
"\n",
"// Launch add() kernel on GPU\n",
"add<<<1,1>>>(d_a, d_b);\n",
"\n",
"// Copy result back to host\n",
"cudaError err = cudaMemcpy(&a, d_a, size, cudaMemcpyDeviceToHost);\n",
" if(err!=cudaSuccess) {\n",
" printf(\"CUDA error copying to Host: %s\\n\", cudaGetErrorString(err));\n",
" }\n",
"printf(\"result is %d\\n\",a);\n",
"\n",
"// Cleanup\n",
"cudaFree(d_a);\n",
"cudaFree(d_b);\n",
"\n",
"return 0;\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i4mVKFx97KTx",
"outputId": "bc0cb70b-7f84-44ca-c5cd-7e9fc648065c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==4619== NVPROF is profiling process 4619, command: ./out\n",
"result is 105\n",
"==4619== Profiling application: ./out\n",
"==4619== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 46.26% 3.3600us 1 3.3600us 3.3600us 3.3600us add(int*, int*)\n",
" 28.63% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n",
" 25.11% 1.8240us 2 912ns 640ns 1.1840us [CUDA memcpy HtoD]\n",
" API calls: 53.54% 150.98ms 2 75.492ms 7.5110us 150.98ms cudaMalloc\n",
" 46.28% 130.52ms 1 130.52ms 130.52ms 130.52ms cudaLaunchKernel\n",
" 0.08% 215.40us 2 107.70us 34.649us 180.76us cudaFree\n",
" 0.07% 194.04us 114 1.7020us 258ns 75.755us cuDeviceGetAttribute\n",
" 0.03% 72.888us 3 24.296us 6.7490us 38.437us cudaMemcpy\n",
" 0.00% 11.698us 1 11.698us 11.698us 11.698us cuDeviceGetName\n",
" 0.00% 8.3590us 1 8.3590us 8.3590us 8.3590us cuDeviceGetPCIBusId\n",
" 0.00% 5.0250us 1 5.0250us 5.0250us 5.0250us cuDeviceTotalMem\n",
" 0.00% 1.9900us 3 663ns 339ns 1.3000us cuDeviceGetCount\n",
" 0.00% 1.0380us 2 519ns 315ns 723ns cuDeviceGet\n",
" 0.00% 642ns 1 642ns 642ns 642ns cuModuleGetLoadingMode\n",
" 0.00% 357ns 1 357ns 357ns 357ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add1.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "xQMHTBWR7x4o",
"outputId": "71790427-a2b9-4f49-dde0-ed1bb2043b53"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add2.cu\n"
]
}
],
"source": [
"%%writefile add2.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" for(int i = 0; i < n; i += 1){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i;\n",
" b[i] = 5;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,256>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
"\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "14oI7-QW8EPy",
"outputId": "f2648648-a7a3-49ca-9750-70ba060441cd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==596== NVPROF is profiling process 596, command: ./out\n",
"==596== Profiling application: ./out\n",
"==596== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 98.52% 1.0440ms 1 1.0440ms 1.0440ms 1.0440ms vector_add(float*, float*, float*, int)\n",
" 1.09% 11.550us 2 5.7750us 5.7590us 5.7910us [CUDA memcpy HtoD]\n",
" 0.39% 4.0960us 1 4.0960us 4.0960us 4.0960us [CUDA memcpy DtoH]\n",
" API calls: 99.00% 176.44ms 3 58.814ms 6.7960us 176.43ms cudaMalloc\n",
" 0.65% 1.1567ms 3 385.58us 26.484us 1.0936ms cudaMemcpy\n",
" 0.20% 353.24us 1 353.24us 353.24us 353.24us cuDeviceTotalMem\n",
" 0.07% 131.50us 96 1.3690us 127ns 47.376us cuDeviceGetAttribute\n",
" 0.05% 94.987us 3 31.662us 4.9660us 79.056us cudaFree\n",
" 0.02% 27.053us 1 27.053us 27.053us 27.053us cudaLaunchKernel\n",
" 0.01% 15.460us 1 15.460us 15.460us 15.460us cuDeviceGetName\n",
" 0.00% 2.6340us 1 2.6340us 2.6340us 2.6340us cuDeviceGetPCIBusId\n",
" 0.00% 1.7800us 3 593ns 186ns 1.0060us cuDeviceGetCount\n",
" 0.00% 827ns 2 413ns 248ns 579ns cuDeviceGet\n",
" 0.00% 255ns 1 255ns 255ns 255ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add2.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "bgHifWBCOmms",
"outputId": "53a3e591-7a07-4e2f-ad7e-265ad9c63a39"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing add3.cu\n"
]
}
],
"source": [
"%%writefile add3.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"#define MAX_ER 1e-6\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
" int index = threadIdx.x;\n",
" int stride = blockDim.x;\n",
"\n",
" for(int i = index; i < n; i += stride){\n",
" out[i] = a[i] + b[i];\n",
" }\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" vector_add<<<1,512>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "hqZBkrd8x0k4",
"outputId": "48dfa7c6-35ed-43c0-b029-c30e810be629"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linux\n"
]
}
],
"source": [
"! uname"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"id": "tj8bqWhN6_ei",
"outputId": "bd822a17-d13a-486f-d111-ea3d300dfdbb"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==186== NVPROF is profiling process 186, command: ./out\n",
"==186== Profiling application: ./out\n",
"==186== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 50.83% 14.591us 2 7.2950us 7.2950us 7.2960us [CUDA memcpy HtoD]\n",
" 34.45% 9.8880us 1 9.8880us 9.8880us 9.8880us vector_add(float*, float*, float*, int)\n",
" 14.72% 4.2240us 1 4.2240us 4.2240us 4.2240us [CUDA memcpy DtoH]\n",
" API calls: 99.74% 261.61ms 3 87.202ms 5.2700us 261.59ms cudaMalloc\n",
" 0.09% 239.89us 96 2.4980us 127ns 159.76us cuDeviceGetAttribute\n",
" 0.07% 184.03us 1 184.03us 184.03us 184.03us cuDeviceTotalMem\n",
" 0.04% 104.40us 3 34.800us 28.086us 48.080us cudaMemcpy\n",
" 0.03% 87.576us 3 29.192us 5.0130us 72.747us cudaFree\n",
" 0.01% 27.161us 1 27.161us 27.161us 27.161us cudaLaunchKernel\n",
" 0.01% 21.881us 1 21.881us 21.881us 21.881us cuDeviceGetName\n",
" 0.00% 3.0560us 1 3.0560us 3.0560us 3.0560us cuDeviceGetPCIBusId\n",
" 0.00% 1.6590us 3 553ns 140ns 1.2640us cuDeviceGetCount\n",
" 0.00% 1.2970us 2 648ns 241ns 1.0560us cuDeviceGet\n",
" 0.00% 251ns 1 251ns 251ns 251ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add3.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7Azzx2U3_DPG",
"outputId": "db632a58-afc9-4c5e-d53a-8639e6b9b287"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting add4.cu\n"
]
}
],
"source": [
"%%writefile add4.cu\n",
"#include <stdio.h>\n",
"\n",
"#define N 10000\n",
"\n",
"__global__ void vector_add(float *out, float *a, float *b, int n) {\n",
"    int tid = blockIdx.x * blockDim.x + threadIdx.x;\n",
"    // guard: grid_size * block_size may exceed n, so skip out-of-range threads\n",
"    if (tid < n) out[tid] = a[tid] + b[tid];\n",
"\n",
"}\n",
"\n",
"int main(){\n",
" float *a, *b, *out;\n",
" float *d_a, *d_b, *d_out;\n",
"\n",
" // Allocate host memory\n",
" a = (float*)malloc(sizeof(float) * N);\n",
" b = (float*)malloc(sizeof(float) * N);\n",
" out = (float*)malloc(sizeof(float) * N);\n",
"\n",
" // Initialize host arrays\n",
" for(int i = 0; i < N; i++){\n",
" a[i] = i+1;\n",
" b[i] = 26;\n",
" }\n",
"\n",
" // Allocate device memory\n",
" cudaMalloc((void**)&d_a, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_b, sizeof(float) * N);\n",
" cudaMalloc((void**)&d_out, sizeof(float) * N);\n",
"\n",
" // Transfer data from host to device memory\n",
" cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);\n",
"\n",
" // Executing kernel\n",
" int block_size = 32;\n",
" int grid_size = ((N + block_size) / block_size);\n",
" vector_add<<<grid_size,block_size>>>(d_out, d_a, d_b, N);\n",
"\n",
" // Transfer data back to host memory\n",
" cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);\n",
"\n",
" // Verification\n",
" //for(int i = 0; i < N; i++){\n",
" // printf(\"%f\\n\",out[i]);\n",
"// }\n",
"\n",
"\n",
"\n",
" // Deallocate device memory\n",
" cudaFree(d_a);\n",
" cudaFree(d_b);\n",
" cudaFree(d_out);\n",
"\n",
" // Deallocate host memory\n",
" free(a);\n",
" free(b);\n",
" free(out);\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6rUKHYj-_hle",
"outputId": "9615f4ee-05a7-4a1c-da7c-2431745fd9b9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==2018== NVPROF is profiling process 2018, command: ./out\n",
"==2018== Profiling application: ./out\n",
"==2018== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 54.56% 10.720us 2 5.3600us 5.2160us 5.5040us [CUDA memcpy HtoD]\n",
" 25.08% 4.9280us 1 4.9280us 4.9280us 4.9280us [CUDA memcpy DtoH]\n",
" 20.36% 4.0000us 1 4.0000us 4.0000us 4.0000us vector_add(float*, float*, float*, int)\n",
" API calls: 99.68% 203.91ms 3 67.971ms 5.1070us 203.90ms cudaMalloc\n",
" 0.11% 214.95us 1 214.95us 214.95us 214.95us cudaLaunchKernel\n",
" 0.07% 137.26us 3 45.753us 5.9300us 121.30us cudaFree\n",
" 0.07% 134.98us 114 1.1840us 144ns 53.102us cuDeviceGetAttribute\n",
" 0.06% 132.93us 3 44.310us 28.425us 62.880us cudaMemcpy\n",
" 0.01% 12.321us 1 12.321us 12.321us 12.321us cuDeviceGetName\n",
" 0.00% 5.5190us 1 5.5190us 5.5190us 5.5190us cuDeviceTotalMem\n",
" 0.00% 5.2280us 1 5.2280us 5.2280us 5.2280us cuDeviceGetPCIBusId\n",
" 0.00% 1.8590us 3 619ns 237ns 1.3680us cuDeviceGetCount\n",
" 0.00% 1.1470us 2 573ns 173ns 974ns cuDeviceGet\n",
" 0.00% 708ns 1 708ns 708ns 708ns cuModuleGetLoadingMode\n",
" 0.00% 228ns 1 228ns 228ns 228ns cuDeviceGetUuid\n"
]
}
],
"source": [
"!nvcc add4.cu -o out\n",
"!nvprof ./out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fvbifMBvgpMX"
},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b6d3aebb",
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"ref: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8eff613",
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"import cupy\n",
"\n",
"# size of the vectors\n",
"size = 1024\n",
"\n",
"# allocating and populating the vectors\n",
"a_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"b_gpu = cupy.random.rand(size, dtype=cupy.float32)\n",
"c_gpu = cupy.zeros(size, dtype=cupy.float32)\n",
"\n",
"# CUDA vector_add\n",
"vector_add_cuda_code = r'''\n",
"extern \"C\"\n",
"__global__ void vector_add(const float * A, const float * B, float * C, const int size)\n",
"{\n",
" int item = threadIdx.x;\n",
" C[item] = A[item] + B[item];\n",
"}\n",
"'''\n",
"vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, \"vector_add\")\n",
"\n",
"vector_add_gpu((1, 1, 1), (size, 1, 1), (a_gpu, b_gpu, c_gpu, size))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
# GPU Programming
A collection of notebooks for learning and practicing GPU programming with CUDA.
## Contents
### Notebooks
- **GPU-Introduction.ipynb** - Introduction to GPU computing concepts and fundamentals
- **GPU-Check ENV.ipynb** - Verify and check GPU environment setup and CUDA configuration
- **GPU-HW CUDA Programming.ipynb** - Homework CUDA programming exercises
- **GPU-Lab C Kernel.ipynb** - CUDA kernel programming with C language
- **GPU-Lab Py Kernel.ipynb** - CUDA kernel programming using CuPy in Python
## Overview
This workspace contains practical exercises and examples for GPU programming, including:
- Vector operations on GPU
- CUDA kernel implementation and execution
- GPU memory management
- Performance optimization techniques
## Requirements
- CUDA Toolkit installed
- CuPy (for Python CUDA programming)
- Jupyter notebook environment
## Getting Started
Start with `GPU-Introduction.ipynb` for foundational concepts, then progress through the other notebooks to practice implementing CUDA kernels and GPU operations.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment