Commit eab6b8e5 authored by Humam Tlay

Adding the solutions

parent 10d4e17f
# CUDA Report
## First Question
In the provided CUDA programs, the error lies in the kernel launch parameters of the `vector_add` kernel in the `add_Vec4.cu` file.
Here is the kernel invocation in `add_Vec4.cu`:
```
vector_add<<<1,256>>>(d_out, d_a, d_b, N);
```
In CUDA, the kernel invocation `<<<gridDim, blockDim>>>(...)` specifies how many blocks and threads per block should be launched to execute the kernel function. In this case, `<<<1,256>>>` means there's only one block with 256 threads.
However, the `vector_add` kernel uses the `tid` (thread ID) variable to compute the index of the element each thread works on:
```
int tid = blockIdx.x * blockDim.x + threadIdx.x;
out[tid] = a[tid] + b[tid];
```
With the current invocation, the `blockIdx.x * blockDim.x + threadIdx.x` computation assumes that multiple blocks are launched to cover the entire array. However, with only one block (as specified by `<<<1,256>>>`), `blockIdx.x` is always 0 and `blockDim.x` is 256, so `tid` only ranges from 0 to 255. As a result, only the first 256 elements of `out`, `a`, and `b` are computed; with `N = 10000`, elements 256 through 9999 of `out` are never written.
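To make the intended mapping concrete, here is a small illustration (the launch numbers are hypothetical, chosen for readability rather than taken from the original program):
```
// With a hypothetical launch of <<<4, 256>>>, blockDim.x == 256 and
// blockIdx.x ranges over 0..3, so tid = blockIdx.x * 256 + threadIdx.x:
//   block 0 computes tid   0..255
//   block 1 computes tid 256..511
//   block 2 computes tid 512..767
//   block 3 computes tid 768..1023
// Each block handles a distinct 256-element slice of the arrays.
```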
## Second Question
To take advantage of the parallel architecture of CUDA GPUs and improve the performance of the `add_Vec4.cu` program, we can modify the kernel launch configuration to use multiple blocks; currently, the program launches only one block of 256 threads. We can then experiment with different combinations of block count and block size and observe the performance impact.
The modified version is attached in the `updated.cu` file. There, `numBlocks` is calculated from the array size `N` and the desired `blockSize`, rounding up so that the blocks together cover the entire data set, with each block processing a portion of the data.
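Concretely, the launch in `updated.cu` follows this pattern (`blockSize` stands for whichever block size is being tested):
```
int numBlocks = (N + blockSize - 1) / blockSize; // round up so numBlocks * blockSize >= N
vector_add<<<numBlocks, blockSize>>>(d_out, d_a, d_b, N);
```
Because the rounded-up grid can launch a few more threads than there are elements, the kernel guards its write with `if (tid < n)`. With this configuration every element of `out` is computed for any `N`, and the block size can be varied (as `updated.cu` does with 32, 128, 512, and 1024) to compare timings.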
#include <stdio.h>
#include <omp.h>

#define NUM_THREADS 4
#define NUM_SEGMENTS 1000000
#define SEGMENT_WIDTH (1.0 / NUM_SEGMENTS)

double function(double x) {
    return x * x; // The function to be integrated
}

int main() {
    double integral = 0.0;
    int i;

    // Midpoint rule: each thread accumulates into its own copy of
    // `integral`, and the reduction combines the copies at the end.
    #pragma omp parallel for num_threads(NUM_THREADS) reduction(+:integral)
    for (i = 0; i < NUM_SEGMENTS; i++) {
        double x = (i + 0.5) * SEGMENT_WIDTH;      // Midpoint of the segment
        double area = function(x) * SEGMENT_WIDTH; // Area of the rectangle
        integral += area;
    }

    printf("Integral: %.10f\n", integral);
    return 0;
}
# OpenMP Report
## Question 1
MPI is designed for distributed-memory parallelism, where multiple processes communicate over a network, and is generally used to parallelize computations across multiple nodes in a cluster. Since this problem involves shared-memory parallelism within a single system, where the loop can be parallelized with a single `parallel for` directive and a reduction, OpenMP is the more appropriate choice.
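For contrast, here is a minimal sketch of what the same midpoint integration could look like with MPI (illustrative only, not part of the submitted solution; it assumes an MPI installation and the usual `mpicc`/`mpirun` toolchain):
```
#include <mpi.h>
#include <stdio.h>

#define NUM_SEGMENTS 1000000
#define SEGMENT_WIDTH (1.0 / NUM_SEGMENTS)

int main(int argc, char **argv) {
    int rank, size;
    double local = 0.0, integral = 0.0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Each process sums a strided share of the segments.
    for (int i = rank; i < NUM_SEGMENTS; i += size) {
        double x = (i + 0.5) * SEGMENT_WIDTH;
        local += x * x * SEGMENT_WIDTH;
    }

    // Combine the per-process partial sums on rank 0.
    MPI_Reduce(&local, &integral, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0)
        printf("Integral: %.10f\n", integral);

    MPI_Finalize();
    return 0;
}
```
The extra boilerplate (process startup, rank bookkeeping, an explicit reduction message) is exactly the overhead OpenMP avoids when all threads already share one address space.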
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N 10000
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard: the rounded-up grid may launch more threads than elements.
    if (tid < n)
        out[tid] = a[tid] + b[tid];
}
// Function to measure time for a kernel launch
float timeKernel(int blockSize, float *d_out, float *d_a, float *d_b, int n) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    // Executing kernel with specified block size
    vector_add<<<(n + blockSize - 1) / blockSize, blockSize>>>(d_out, d_a, d_b, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return milliseconds;
}
int main() {
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate host memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = i + 1;
        b[i] = 26;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // Test different block sizes and measure time
    printf("Block Size \t Time (ms)\n");
    printf("----------- \t ----------\n");

    int blockSizes[] = {32, 128, 512, 1024};
    int numBlockSizes = sizeof(blockSizes) / sizeof(blockSizes[0]);
    for (int i = 0; i < numBlockSizes; i++) {
        int blockSize = blockSizes[i];
        float elapsed_time = timeKernel(blockSize, d_out, d_a, d_b, N);
        printf("%4d \t\t %f\n", blockSize, elapsed_time);
    }

    // Transfer data back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Verification (if needed)
    /*
    for (int i = 0; i < N; i++) {
        printf("%f\n", out[i]);
    }
    */

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return 0;
}
# OpenMP and CUDA
The solutions are in the README files in the subdirectories.