Commit 4104e57c authored by mohamadbashar.disoki's avatar mohamadbashar.disoki

Update OpenMP CUDA

parent 3fd570ea
#include <stdio.h>
#define N 10000
/*
 * Element-wise vector addition: out[i] = a[i] + b[i] for every i in [0, n).
 *
 * Launch layout: any 1D grid of 1D blocks. Each thread starts at its flat
 * global index and advances by the total number of launched threads, so the
 * elements are divided among the threads instead of every thread recomputing
 * (and rewriting) the whole array. With a <<<1,1>>> launch this degenerates
 * to the plain sequential loop.
 *
 * Preconditions: out, a and b point to device-accessible buffers holding at
 * least n floats each. No shared memory is used.
 */
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int first = blockIdx.x * blockDim.x + threadIdx.x;
    int step = gridDim.x * blockDim.x;
    for (int i = first; i < n; i += step) {
        out[i] = a[i] + b[i];
    }
}
/*
 * Host driver for the vector-add demo.
 *
 * Fills two host arrays of N floats, copies them to the device, launches
 * vector_add with 1 block of 256 threads, copies the result back and checks
 * it against a host-side reference.
 *
 * Returns 0 when every element matches, 1 on an allocation failure or a
 * result mismatch. Any failing CUDA runtime call terminates the program.
 */
int main(){
    const size_t bytes = sizeof(float) * N;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Stop at the first failing CUDA call; otherwise a failure would be
    // silently ignored and only show up as wrong data later.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            exit(1);
        }
    };

    // Allocate host memory
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(out);
        return 1;
    }

    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = i;
        b[i] = 5;
    }

    // Allocate device memory
    check(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    // Transfer data from host to device memory
    check(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
    check(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

    // Executing kernel
    vector_add<<<1,256>>>(d_out, d_a, d_b, N);
    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    check(cudaGetLastError(), "vector_add launch");

    // Transfer data back to host memory. Errors raised while the kernel was
    // running are reported by this synchronizing call.
    check(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_out -> out)");

    // Verification against a host-side reference
    int mismatches = 0;
    for(int i = 0; i < N; i++){
        float diff = out[i] - (a[i] + b[i]);
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-6f) mismatches++;
    }
    if (mismatches == 0) {
        printf("PASSED\n");
    } else {
        fprintf(stderr, "FAILED: %d of %d elements differ\n", mismatches, N);
    }

    // Deallocate device memory
    check(cudaFree(d_a), "cudaFree(d_a)");
    check(cudaFree(d_b), "cudaFree(d_b)");
    check(cudaFree(d_out), "cudaFree(d_out)");

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return mismatches == 0 ? 0 : 1;
}
\ No newline at end of file
#include <stdio.h>
#define N 10000
#define MAX_ER 1e-6
/*
 * Element-wise vector addition: out[k] = a[k] + b[k] for k in [0, n).
 *
 * Each thread begins at its own threadIdx.x and then jumps ahead by
 * blockDim.x until it runs past n. blockIdx is not consulted, so the
 * indexing only partitions the work when the kernel is launched with a
 * single 1D block; every block of a larger grid would repeat the same
 * elements.
 *
 * Preconditions: out, a and b are device-accessible buffers of at least n
 * floats. No shared memory is used.
 */
__global__ void vector_add(float *out, float *a, float *b, int n) {
    const int lanes = blockDim.x;
    int pos = threadIdx.x;
    while (pos < n) {
        out[pos] = a[pos] + b[pos];
        pos += lanes;
    }
}
/*
 * Host driver for the single-block vector-add demo.
 *
 * Fills two host arrays of N floats, copies them to the device, launches
 * vector_add with 1 block of 512 threads, copies the result back and checks
 * every element against a host-side reference using the MAX_ER tolerance.
 *
 * Returns 0 when every element is within tolerance, 1 on an allocation
 * failure or a result mismatch. Any failing CUDA runtime call terminates
 * the program.
 */
int main(){
    const size_t bytes = sizeof(float) * N;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Stop at the first failing CUDA call instead of silently carrying on.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            exit(1);
        }
    };

    // Allocate host memory
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(out);
        return 1;
    }

    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = i+1;
        b[i] = 26;
    }

    // Allocate device memory
    check(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    // Transfer data from host to device memory
    check(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
    check(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

    // Executing kernel
    vector_add<<<1,512>>>(d_out, d_a, d_b, N);
    // Launch-configuration errors are only reported via cudaGetLastError().
    check(cudaGetLastError(), "vector_add launch");

    // Transfer data back to host memory. Errors raised while the kernel was
    // running are reported by this synchronizing call.
    check(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_out -> out)");

    // Verification against a host-side reference, within MAX_ER
    int mismatches = 0;
    for(int i = 0; i < N; i++){
        float diff = out[i] - (a[i] + b[i]);
        if (diff < 0.0f) diff = -diff;
        if (diff > MAX_ER) mismatches++;
    }
    if (mismatches == 0) {
        printf("PASSED\n");
    } else {
        fprintf(stderr, "FAILED: %d of %d elements differ\n", mismatches, N);
    }

    // Deallocate device memory
    check(cudaFree(d_a), "cudaFree(d_a)");
    check(cudaFree(d_b), "cudaFree(d_b)");
    check(cudaFree(d_out), "cudaFree(d_out)");

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return mismatches == 0 ? 0 : 1;
}
\ No newline at end of file
#include <stdio.h>
#define N 10000
/*
 * Element-wise vector addition, one element per thread:
 * out[tid] = a[tid] + b[tid] where tid is the thread's flat global index.
 *
 * Launch layout: 1D grid of 1D blocks with at least n threads in total.
 * The grid is normally rounded up to a whole number of blocks, so some
 * threads of the last block have tid >= n; those threads do nothing rather
 * than touching memory past the end of the buffers.
 *
 * Preconditions: out, a and b are device-accessible buffers of at least n
 * floats. No shared memory is used.
 */
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        out[tid] = a[tid] + b[tid];
    }
}
/*
 * Host driver for the multi-block vector-add demo.
 *
 * Fills two host arrays of N floats, copies them to the device, launches
 * vector_add on enough 512-thread blocks to cover N elements, copies the
 * result back and checks it against a host-side reference.
 *
 * Returns 0 when every element matches, 1 on an allocation failure or a
 * result mismatch. Any failing CUDA runtime call terminates the program.
 */
int main(){
    const size_t bytes = sizeof(float) * N;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Stop at the first failing CUDA call instead of silently carrying on.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            exit(1);
        }
    };

    // Allocate host memory
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(out);
        return 1;
    }

    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = i+1;
        b[i] = 26;
    }

    // Allocate device memory
    check(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    // Transfer data from host to device memory
    check(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
    check(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

    // Executing kernel
    int block_size = 512;
    // Ceiling division: the smallest number of blocks whose threads cover N.
    // (N + block_size) / block_size would launch one spare block whenever N
    // is an exact multiple of block_size.
    int grid_size = ((N + block_size - 1) / block_size);
    // NOTE: unless N is a multiple of block_size the last block still holds
    // threads with an index >= N, so the kernel must bounds-check its index.
    vector_add<<<grid_size,block_size>>>(d_out, d_a, d_b, N);
    // Launch-configuration errors are only reported via cudaGetLastError().
    check(cudaGetLastError(), "vector_add launch");

    // Transfer data back to host memory. Errors raised while the kernel was
    // running are reported by this synchronizing call.
    check(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_out -> out)");

    // Verification against a host-side reference
    int mismatches = 0;
    for(int i = 0; i < N; i++){
        float diff = out[i] - (a[i] + b[i]);
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-6f) mismatches++;
    }
    if (mismatches == 0) {
        printf("PASSED\n");
    } else {
        fprintf(stderr, "FAILED: %d of %d elements differ\n", mismatches, N);
    }

    // Deallocate device memory
    check(cudaFree(d_a), "cudaFree(d_a)");
    check(cudaFree(d_b), "cudaFree(d_b)");
    check(cudaFree(d_out), "cudaFree(d_out)");

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return mismatches == 0 ? 0 : 1;
}
# Makefile
# Source file
SRC = helloWorld.c
SRC = sync_lock.c
# Compiler
CC = gcc
......
#include <stdio.h>
#include <omp.h>
//simple example to demonstrate the single and master constructs inside a parallel region
/*
 * Opens a parallel region that requests 12 threads and prints two messages:
 * one from inside a `single` construct and one from inside a `master`
 * construct. Each message carries the id of the thread that executed it.
 */
void start()
{
    #pragma omp parallel num_threads(12)
    {
        /* Private per-thread id, captured once on entry to the region. */
        const int me = omp_get_thread_num();

        #pragma omp single
        printf("====> I am Thread %d Inside the single region <===\n", me);

        #pragma omp master
        printf("====> I am sure that this is the master thread, My id = %d <===\n", me);
    }
}
/*
 * Program entry point: runs the demo once.
 *
 * The command-line arguments are accepted but not used. Returns 0 so the
 * process reports a well-defined exit status (a `void main` leaves the
 * status unspecified and is not a conforming hosted signature).
 */
int main(int count, char *arg[])
{
    start();
    return 0;
}
\ No newline at end of file
......@@ -17,11 +17,11 @@ void startParallelHeloworld()
printf("Hello From Master: => Threads Count is: %d \n", omp_get_num_threads());
else
printf("Hello from Thread %d\n", id);
for(int i=0;i<1000;i++){
for(int i=0;i<10000;i++){
x++;
}
}
//printf("Done x=%d",x);
printf("Done x=%d\n",x);
}
//SPMD
......
File added
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
static long steps = 1000000000;
double step;
int main (int argc, const char *argv[]) {
int i,j;
double x;
double pi, sum = 0.0;
double start, delta;
step = 1.0/(double) steps;
sum = 0.0;
#pragma omp parallel for reduction(+:sum) private(x)
for (i=0; i < steps; i++) {
x = (i+0.5)*step;
sum += 4.0 / (1.0+x*x);
}
// Out of the parallel region, finialize computation
pi = step * sum;
printf("PI = %.16g\n", pi);
}
\ No newline at end of file
#include <stdio.h>
#include <omp.h>
//simple example to demonstrate race condition in parallel programs
/*
 * Race-condition demo. `x` is declared outside the parallel region and is
 * written by thread 5 while the other threads read and print it with no
 * synchronization, so the value printed before the barrier depends on
 * timing. After the barrier every thread prints `x` again.
 *
 * The region requests 20 threads.
 */
void start()
{
    int x = 2;
    #pragma omp parallel num_threads(20)
    {
        const int me = omp_get_thread_num();

        if (me != 5) {
            /* Unsynchronized read: may see x before or after thread 5's write. */
            printf("I am Thread %d and for me x=%d \n", me, x);
        } else {
            x = 5;
        }

        /* Nobody proceeds until every thread, including the writer, has arrived. */
        #pragma omp barrier
        printf("After barrier => I'am thread %d and x=%d\n", me, x);
    }
}
/*
 * Program entry point: runs the demo once.
 *
 * The command-line arguments are accepted but not used. Returns 0 so the
 * process reports a well-defined exit status (a `void main` leaves the
 * status unspecified and is not a conforming hosted signature).
 */
int main(int count, char *arg[])
{
    start();
    return 0;
}
#include <stdio.h>
#include <omp.h>
//simple example to demonstrate a parallel for loop with a sum reduction
/*
 * Fills a 10-element array with 0..9, sums it in an OpenMP parallel loop
 * using a `+` reduction on `sum`, and prints the average of the elements.
 */
void start()
{
    int length = 10;
    int array[length];

    /* array[k] = k */
    int fill = 0;
    while (fill < length) {
        array[fill] = fill;
        fill++;
    }

    double sum = 0.0;
    /* Each thread accumulates a private partial sum; the reduction combines them. */
    #pragma omp parallel for reduction (+:sum)
    for (int pos = 0; pos < length; pos++) {
        sum += array[pos];
    }

    //calculate average
    double avg = sum / (double)length;
    printf("Average is avg = %f\n", avg);
}
/*
 * Program entry point: runs the demo once.
 *
 * The command-line arguments are accepted but not used. Returns 0 so the
 * process reports a well-defined exit status (a `void main` leaves the
 * status unspecified and is not a conforming hosted signature).
 */
int main(int count, char *arg[])
{
    start();
    return 0;
}
#include <stdio.h>
#include <omp.h>
//simple example to demonstrate a barrier followed by a section protected with an OpenMP lock
/*
 * Lock demo. Inside a parallel region that requests 6 threads, thread 0
 * writes the shared `x` while the other threads print it (unsynchronized,
 * so the printed value depends on timing). After a barrier each thread
 * takes the lock, prints its two messages and releases the lock, so the
 * two lines belonging to one thread are not interleaved with another
 * thread's pair.
 *
 * The lock is initialized before the region and destroyed after it, so
 * the resources held by the lock are released.
 */
void start()
{
    int x = 2;
    omp_lock_t lock;
    omp_init_lock(&lock);
    #pragma omp parallel num_threads(6)
    {
        int threadId = omp_get_thread_num();
        if (threadId == 0)
            x = 5;
        else
            printf("I am Thread %d and for me x=%d \n", threadId, x);
        #pragma omp barrier
        {
            /* Only one thread at a time may hold the lock. */
            omp_set_lock(&lock);
            printf("\n\nAfter lock => I am Thread %d\n", threadId);
            printf("I am same thread %d\n", threadId);
            omp_unset_lock(&lock);
        }
    }
    /* The parallel region has ended, so no thread can still hold the lock. */
    omp_destroy_lock(&lock);
}
/*
 * Program entry point: runs the demo once.
 *
 * The command-line arguments are accepted but not used. Returns 0 so the
 * process reports a well-defined exit status (a `void main` leaves the
 * status unspecified and is not a conforming hosted signature).
 */
int main(int count, char *arg[])
{
    start();
    return 0;
}
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment