Thread Cooperation
These days I am trying to learn CUDA, and I am following a few books along the way. In this post I use CUDA by Example: An Introduction to General-Purpose GPU Programming, written by Jason Sanders and Edward Kandrot.
In this section I work through the thread cooperation topic. First, I rewrite the summation of long vectors.
Vector Summation
#include <stdio.h>
#include <stdlib.h>

#define N (1024*1024)

// Each thread computes one element of the sum.
__global__ void sum( int *a, int *b, int *c ) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main(int argc, char const *argv[]) {
    int *host_A, *host_B, *host_C;
    int *device_A, *device_B, *device_C;

    // allocate host and device buffers
    host_A = (int*)malloc(N*sizeof(int));
    host_B = (int*)malloc(N*sizeof(int));
    host_C = (int*)malloc(N*sizeof(int));

    cudaMalloc((void**)&device_A, N*sizeof(int));
    cudaMalloc((void**)&device_B, N*sizeof(int));
    cudaMalloc((void**)&device_C, N*sizeof(int));

    // fill the inputs on the host
    for (int i = 0; i < N; i++) {
        host_A[i] = i;
        host_B[i] = N - i;
    }

    cudaMemcpy(device_A, host_A, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_B, host_B, N*sizeof(int), cudaMemcpyHostToDevice);

    // 1024 blocks of 1024 threads: exactly one thread per element
    sum<<<1024,1024>>>(device_A, device_B, device_C);

    cudaMemcpy(host_C, device_C, N*sizeof(int), cudaMemcpyDeviceToHost);

    // verify the result on the CPU
    bool success = true;
    for (int i = 0; i < N; i++) {
        if ((host_A[i] + host_B[i]) != host_C[i]) {
            printf( "Error: %d + %d != %d\n", host_A[i], host_B[i], host_C[i] );
            success = false;
        }
    }
    if (success) printf( "Correct!\n" );

    cudaFree(device_A);
    cudaFree(device_B);
    cudaFree(device_C);

    free(host_A);
    free(host_B);
    free(host_C);

    return 0;
}
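This launch works because 1024 blocks of 1024 threads give exactly N threads. The book also shows how to handle vectors longer than the grid by letting each thread stride through the array. A minimal sketch of that variant (the kernel name sum_stride and the 128/128 launch configuration are my own choices, not from the listing above):

// Grid-stride variant: each thread processes every (gridDim.x * blockDim.x)-th
// element, so the launch no longer has to cover N with one thread per element.
__global__ void sum_stride( int *a, int *b, int *c ) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;   // jump ahead by the total number of threads
    }
}

// Launched with a fixed-size grid, independent of N:
// sum_stride<<<128,128>>>(device_A, device_B, device_C);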
GPU Ripple Using Threads
In this part, I share the GPU ripple animation code written with CUDA.
// CEMALETTIN YILMAZ
#include <stdio.h>
#include "cuda.h"
#include "../Include/cpu_anim.h"

#define Nx 1024
#define Ny 1024

__global__ void ripple(unsigned char *data, int counter) {
    // map the 2D thread/block indices to a pixel position and a linear offset
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // distance of the pixel from the image center
    float fx = x - Nx/2;
    float fy = y - Ny/2;
    float d = sqrtf(fx*fx + fy*fy);

    // ripple intensity: a cosine wave that fades with distance and shifts over time
    unsigned char value = (unsigned char)(128.0f + 127.0f *
                          cos(d/10.0f - counter/7.0f) /
                          (d/10.0f + 1.0f));

    data[offset*4 + 0] = 0;
    data[offset*4 + 1] = 0;
    data[offset*4 + 2] = value;
    data[offset*4 + 3] = 255;
}

struct DataBlock {
    unsigned char *device_bitmap;
    CPUAnimBitmap *anim_bitmap;
};

// called once per animation frame
void generate_frame(DataBlock *d, int ticks) {
    dim3 blocks(Nx/16, Ny/16);
    dim3 threads(16, 16);
    ripple<<<blocks,threads>>>(d->device_bitmap, ticks);
    cudaMemcpy(d->anim_bitmap->get_ptr(), d->device_bitmap,
               d->anim_bitmap->image_size(), cudaMemcpyDeviceToHost);
}

void cleanup(DataBlock *data) {
    cudaFree(data->device_bitmap);
}

int main(int argc, char const *argv[]) {
    DataBlock data;
    CPUAnimBitmap bitmap(Nx, Ny, &data);
    data.anim_bitmap = &bitmap;

    cudaMalloc((void**)&data.device_bitmap, bitmap.image_size());

    bitmap.anim_and_exit((void(*)(void*,int))generate_frame,
                         (void(*)(void*))cleanup);
    return 0;
}
Output:
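One thing both listings above skip is error checking on the CUDA runtime calls; the book wraps every call in a HANDLE_ERROR macro from its helper header. A minimal sketch of the same idea (the CHECK_CUDA name is my own, not from the book):

// Requires <stdio.h> and <stdlib.h>.
// Prints the CUDA error string and exits if a runtime call fails.
#define CHECK_CUDA(call)                                             \
    do {                                                             \
        cudaError_t err = (call);                                    \
        if (err != cudaSuccess) {                                    \
            printf("CUDA error: %s at %s:%d\n",                      \
                   cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

// Usage:
// CHECK_CUDA( cudaMalloc((void**)&device_A, N*sizeof(int)) );
// CHECK_CUDA( cudaMemcpy(device_A, host_A, N*sizeof(int), cudaMemcpyHostToDevice) );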