Commit 90afb7f5 authored by mohammad.salama's avatar mohammad.salama

Report + Solutions

parent ccf90713
These folders have C code to solve the first question (integral of x^2)
in each case (OpenMP - MPI - OpenMP&MPI)
The folders in "First Question - Part 1" have C code to solve the first question (integral of x^2)
in each case (OpenMP - MPI - OpenMP&MPI)
they implement solution and each one prints its time. they implement solution and each one prints its time.
note : they work on any integral just edit the integral_start and integral_end.
NOTE : they work on any integral just edit the integral_start and integral_end.
The text file "Second Question - Part 2 - Statistics" contains the output produced by running the executable built from
"CUDA-add_Vec4.cu" on Google Colab.
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#define N 10000
/*
 * Element-wise vector addition: out[i] = a[i] + b[i] for every i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks, of any size. Each thread starts at
 * its flat global index and then advances by the total number of launched
 * threads, so every element is processed even when fewer than n threads are
 * launched, and no thread touches index n or beyond when more are launched.
 *
 * Parameters:
 *   out - destination array, at least n floats
 *   a   - first operand array, at least n floats
 *   b   - second operand array, at least n floats
 *   n   - number of elements to add; n <= 0 makes the kernel a no-op
 *
 * The kernel dereferences all three pointers on the device and uses no
 * shared memory.
 */
__global__ void vector_add(float *out, float *a, float *b, int n) {
    /* 64-bit index arithmetic: blockIdx.x * blockDim.x and the running index
       cannot wrap around even for very large grids or n close to INT_MAX. */
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        out[i] = a[i] + b[i];
    }
}
/* Prints a diagnostic naming the failed step and terminates the program when
   a CUDA runtime call did not return cudaSuccess. */
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two N-element float vectors on the GPU and checks the result against
 * a reference sum computed on the host.
 *
 * Prints "error at index <i>" for every element that differs from the
 * reference. Returns 0 when all N elements match, EXIT_FAILURE when any
 * element differs or a host allocation fails; a failing CUDA call terminates
 * the program through check_cuda().
 */
int main(){
    float *a, *b, *out , *c;
    float *d_a, *d_b, *d_out;

    const size_t bytes = sizeof(float) * N;

    /* Launch enough blocks to give every element its own thread
       (ceil-division, since N need not be a multiple of the block size). */
    const int threads_per_block = 256;
    const int blocks = (N + threads_per_block - 1) / threads_per_block;

    /* Device buffers are rounded up to a whole number of blocks so that the
       threads past index N-1 in the last block stay inside allocated memory
       even if the kernel does not bounds-check its index. */
    const size_t padded_bytes =
        sizeof(float) * (size_t)blocks * (size_t)threads_per_block;

    // Allocate host memory
    a   = (float*)malloc(bytes);
    b   = (float*)malloc(bytes);
    c   = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        /* free(NULL) is a no-op, so releasing all four is safe. */
        free(a);
        free(b);
        free(c);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays and the host-side reference result
    for(int i = 0; i < N; i++){
        a[i] = (float)(i + 1);
        b[i] = 26.0f;
        c[i] = a[i] + b[i];
    }

    // Allocate device memory
    check_cuda(cudaMalloc((void**)&d_a, padded_bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, padded_bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_out, padded_bytes), "cudaMalloc(d_out)");

    /* Give the padding of the input buffers a defined value (all-zero bytes). */
    check_cuda(cudaMemset(d_a, 0, padded_bytes), "cudaMemset(d_a)");
    check_cuda(cudaMemset(d_b, 0, padded_bytes), "cudaMemset(d_b)");

    // Transfer data from host to device memory
    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice),
               "copy of a to device");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice),
               "copy of b to device");

    // Executing kernel
    vector_add<<<blocks, threads_per_block>>>(d_out, d_a, d_b, N);
    /* A launch returns no status itself: cudaGetLastError() reports an
       invalid launch configuration, the synchronize reports faults raised
       while the kernel was running. */
    check_cuda(cudaGetLastError(), "vector_add launch");
    check_cuda(cudaDeviceSynchronize(), "vector_add execution");

    // Transfer data back to host memory
    check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost),
               "copy of result to host");

    // Verification
    /* Exact comparison is intentional: each element is a single IEEE-754
       float addition, which is correctly rounded on both host and device. */
    int mismatches = 0;
    for(int i = 0; i < N; i++)
    {
        if (c[i] != out[i]) {
            printf("error at index %d\n",i);
            mismatches++;
        }
    }

    // Deallocate device memory
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");

    // Deallocate host memory
    free(a);
    free(b);
    free(c);
    free(out);

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}
\ No newline at end of file
Statistics for different block sizes:
size = 32
==2108== NVPROF is profiling process 2108, command: ./out
==2108== Profiling application: ./out
==2108== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 56.31% 11.712us 2 5.8560us 5.7920us 5.9200us [CUDA memcpy HtoD]
26.92% 5.6000us 1 5.6000us 5.6000us 5.6000us [CUDA memcpy DtoH]
16.77% 3.4880us 1 3.4880us 3.4880us 3.4880us vector_add(float*, float*, float*, int)
API calls: 99.69% 208.02ms 3 69.341ms 4.7210us 208.01ms cudaMalloc
0.09% 186.86us 1 186.86us 186.86us 186.86us cudaLaunchKernel
0.07% 155.85us 3 51.951us 35.421us 79.319us cudaMemcpy
0.07% 155.00us 114 1.3590us 165ns 57.863us cuDeviceGetAttribute
0.06% 116.71us 3 38.903us 4.3660us 102.56us cudaFree
0.01% 14.228us 1 14.228us 14.228us 14.228us cuDeviceGetName
0.00% 8.9680us 1 8.9680us 8.9680us 8.9680us cuDeviceTotalMem
0.00% 6.8000us 1 6.8000us 6.8000us 6.8000us cuDeviceGetPCIBusId
0.00% 2.1840us 3 728ns 261ns 1.6210us cuDeviceGetCount
0.00% 1.3430us 2 671ns 213ns 1.1300us cuDeviceGet
0.00% 614ns 1 614ns 614ns 614ns cuModuleGetLoadingMode
0.00% 281ns 1 281ns 281ns 281ns cuDeviceGetUuid
size = 128
==2476== NVPROF is profiling process 2476, command: ./out
==2476== Profiling application: ./out
==2476== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 57.07% 11.359us 2 5.6790us 5.6000us 5.7590us [CUDA memcpy HtoD]
26.05% 5.1840us 1 5.1840us 5.1840us 5.1840us [CUDA memcpy DtoH]
16.88% 3.3600us 1 3.3600us 3.3600us 3.3600us vector_add(float*, float*, float*, int)
API calls: 99.70% 198.27ms 3 66.092ms 3.4580us 198.26ms cudaMalloc
0.10% 194.99us 1 194.99us 194.99us 194.99us cudaLaunchKernel
0.07% 142.93us 114 1.2530us 166ns 54.788us cuDeviceGetAttribute
0.06% 117.05us 3 39.017us 24.448us 57.669us cudaMemcpy
0.05% 107.18us 3 35.727us 3.8760us 95.703us cudaFree
0.01% 12.653us 1 12.653us 12.653us 12.653us cuDeviceGetName
0.00% 6.6270us 1 6.6270us 6.6270us 6.6270us cuDeviceGetPCIBusId
0.00% 4.8520us 1 4.8520us 4.8520us 4.8520us cuDeviceTotalMem
0.00% 1.8900us 3 630ns 241ns 1.3090us cuDeviceGetCount
0.00% 1.2750us 2 637ns 226ns 1.0490us cuDeviceGet
0.00% 550ns 1 550ns 550ns 550ns cuModuleGetLoadingMode
0.00% 237ns 1 237ns 237ns 237ns cuDeviceGetUuid
size = 256
==2626== NVPROF is profiling process 2626, command: ./out
==2626== Profiling application: ./out
==2626== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 58.49% 11.903us 2 5.9510us 5.7910us 6.1120us [CUDA memcpy HtoD]
25.00% 5.0880us 1 5.0880us 5.0880us 5.0880us [CUDA memcpy DtoH]
16.51% 3.3600us 1 3.3600us 3.3600us 3.3600us vector_add(float*, float*, float*, int)
API calls: 99.68% 233.33ms 3 77.777ms 5.1320us 233.32ms cudaMalloc
0.10% 223.04us 1 223.04us 223.04us 223.04us cudaLaunchKernel
0.09% 209.17us 114 1.8340us 198ns 78.522us cuDeviceGetAttribute
0.06% 144.84us 3 48.280us 22.959us 62.714us cudaMemcpy
0.06% 134.05us 3 44.682us 5.7930us 118.14us cudaFree
0.01% 15.809us 1 15.809us 15.809us 15.809us cuDeviceGetName
0.00% 8.2400us 1 8.2400us 8.2400us 8.2400us cuDeviceGetPCIBusId
0.00% 6.5570us 1 6.5570us 6.5570us 6.5570us cuDeviceTotalMem
0.00% 3.0730us 3 1.0240us 421ns 2.2110us cuDeviceGetCount
0.00% 1.2450us 2 622ns 326ns 919ns cuDeviceGet
0.00% 640ns 1 640ns 640ns 640ns cuModuleGetLoadingMode
0.00% 490ns 1 490ns 490ns 490ns cuDeviceGetUuid
size = 512
==2756== NVPROF is profiling process 2756, command: ./out
==2756== Profiling application: ./out
==2756== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 56.17% 11.072us 2 5.5360us 5.3440us 5.7280us [CUDA memcpy HtoD]
25.97% 5.1190us 1 5.1190us 5.1190us 5.1190us [CUDA memcpy DtoH]
17.86% 3.5200us 1 3.5200us 3.5200us 3.5200us vector_add(float*, float*, float*, int)
API calls: 99.72% 196.88ms 3 65.628ms 3.0210us 196.88ms cudaMalloc
0.09% 174.50us 1 174.50us 174.50us 174.50us cudaLaunchKernel
0.07% 140.53us 114 1.2320us 160ns 55.528us cuDeviceGetAttribute
0.06% 111.19us 3 37.063us 24.718us 53.988us cudaMemcpy
0.05% 101.55us 3 33.849us 3.7220us 90.602us cudaFree
0.01% 12.820us 1 12.820us 12.820us 12.820us cuDeviceGetName
0.00% 5.6640us 1 5.6640us 5.6640us 5.6640us cuDeviceGetPCIBusId
0.00% 4.7700us 1 4.7700us 4.7700us 4.7700us cuDeviceTotalMem
0.00% 2.4160us 3 805ns 254ns 1.8550us cuDeviceGetCount
0.00% 1.0750us 2 537ns 219ns 856ns cuDeviceGet
0.00% 605ns 1 605ns 605ns 605ns cuModuleGetLoadingMode
0.00% 212ns 1 212ns 212ns 212ns cuDeviceGetUuid
size = 1024
==2864== NVPROF is profiling process 2864, command: ./out
==2864== Profiling application: ./out
==2864== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 54.15% 11.264us 2 5.6320us 5.5360us 5.7280us [CUDA memcpy HtoD]
27.38% 5.6960us 1 5.6960us 5.6960us 5.6960us [CUDA memcpy DtoH]
18.46% 3.8400us 1 3.8400us 3.8400us 3.8400us vector_add(float*, float*, float*, int)
API calls: 99.69% 199.51ms 3 66.502ms 3.1840us 199.49ms cudaMalloc
0.09% 172.99us 114 1.5170us 154ns 79.193us cuDeviceGetAttribute
0.09% 170.38us 1 170.38us 170.38us 170.38us cudaLaunchKernel
0.07% 135.40us 3 45.132us 25.390us 76.857us cudaMemcpy
0.06% 117.82us 3 39.272us 5.3020us 102.82us cudaFree
0.01% 14.332us 1 14.332us 14.332us 14.332us cuDeviceGetName
0.00% 9.6620us 1 9.6620us 9.6620us 9.6620us cuDeviceGetPCIBusId
0.00% 4.7660us 1 4.7660us 4.7660us 4.7660us cuDeviceTotalMem
0.00% 1.8000us 3 600ns 221ns 1.2840us cuDeviceGetCount
0.00% 1.0390us 2 519ns 202ns 837ns cuDeviceGet
0.00% 673ns 1 673ns 673ns 673ns cuModuleGetLoadingMode
0.00% 279ns 1 279ns 279ns 279ns cuDeviceGetUuid
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment