Parallel Programming in CUDA C
These days I am trying to learn CUDA, and I am following a few books along the way. In this post, I use CUDA by Example: An Introduction to General-Purpose GPU Programming, written by Jason Sanders and Edward Kandrot.
In this section, I explore parallel programming in CUDA C. First, I implement vector summation.
#include <stdio.h>

#define N 1024

__global__ void sum( int *a, int *b, int *c ) {
    int tid = blockIdx.x;    // each block's single thread handles the element at its block index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main(int argc, char const *argv[]) {
    int *host_A, *host_B, *host_C;
    int *device_A, *device_B, *device_C;

    // allocate memory on the CPU
    host_A = (int*)malloc(N*sizeof(int));
    host_B = (int*)malloc(N*sizeof(int));
    host_C = (int*)malloc(N*sizeof(int));

    // allocate memory on the GPU
    cudaMalloc((void**)&device_A, N*sizeof(int));
    cudaMalloc((void**)&device_B, N*sizeof(int));
    cudaMalloc((void**)&device_C, N*sizeof(int));

    // fill the input vectors on the host
    for (int i = 0; i < N; i++) {
        host_A[i] = i;
        host_B[i] = N - i;
    }

    // copy the inputs to the GPU
    cudaMemcpy(device_A, host_A, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_B, host_B, N*sizeof(int), cudaMemcpyHostToDevice);

    // launch N blocks of one thread each
    sum<<<N,1>>>(device_A, device_B, device_C);

    // copy the result back to the host
    cudaMemcpy(host_C, device_C, N*sizeof(int), cudaMemcpyDeviceToHost);

    // verify on the CPU
    bool success = true;
    for (int i = 0; i < N; i++) {
        if ((host_A[i] + host_B[i]) != host_C[i]) {
            printf( "Error: %d + %d != %d\n", host_A[i], host_B[i], host_C[i] );
            success = false;
        }
    }
    if (success) printf( "Correct!\n" );

    cudaFree(device_A);
    cudaFree(device_B);
    cudaFree(device_C);

    free(host_A);
    free(host_B);
    free(host_C);

    return 0;
}
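A note on the launch sum<<<N,1>>>: it starts N blocks with a single thread in each, and every block picks its element through blockIdx.x. The book points out that a grid dimension is capped at 65,535 blocks on the hardware of that era, so for larger vectors the element index should combine the block and thread indices. Here is a minimal sketch of that idea (my own variant, not from the book, with an assumed block size of 256):

// Variant kernel: one thread per element, many threads per block.
// The guard handles N that is not a multiple of the block size.
__global__ void sum_threads( int *a, int *b, int *c ) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

// Launch with enough 256-thread blocks to cover all N elements:
// sum_threads<<<(N + 255) / 256, 256>>>(device_A, device_B, device_C);

The program compiles with nvcc, e.g. nvcc sum.cu -o sum.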
But this example is everywhere on the internet, so we can try something different. The book also includes a Julia set visualization, which we can reimplement. The idea: each pixel (x, y) is mapped to a complex number z, and the kernel repeatedly applies z ← z² + c (here c = -0.8 + 0.156i); if the orbit stays bounded after 200 iterations, the point is considered part of the set and colored red.
Julia Set Example:
#include <stdio.h>
#include "../Include/cpu_bitmap.h"

// half-HD image dimensions (parenthesized so the macros expand safely)
#define Nx (1920/2)
#define Ny (1080/2)

struct cuComplex {
    float r;
    float i;
    // cuComplex( float a, float b ) : r(a), i(b) {}
    __device__ cuComplex( float a, float b ) : r(a), i(b) {}  // __device__ fixes the "calling a host function from device code" error
    __device__ float magnitude2( void ) { return r * r + i * i; }
    __device__ cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    __device__ cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};

__device__ int julia( int x, int y ) {
    // map the pixel to a point in the complex plane, centered and scaled
    const float scale = 1.5;
    float jx = scale * (float)(Nx/2 - x)/(Nx/2);
    float jy = scale * (float)(Ny/2 - y)/(Ny/2);

    cuComplex c(-0.8, 0.156);
    cuComplex a(jx, jy);

    // iterate z <- z*z + c; bail out once the orbit diverges
    for (int i = 0; i < 200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }
    return 1;
}

__global__ void change_bitmap( unsigned char *ptr ) {
    // map from blockIdx to pixel position
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x + y * gridDim.x;

    // now calculate the value at that position
    int juliaValue = julia( x, y );
    ptr[offset*4 + 0] = 255 * juliaValue;  // red if the point is in the set
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;               // opaque alpha
}

int main(int argc, char const *argv[]) {
    CPUBitmap host_bitmap(Nx, Ny);
    unsigned char *device_bitmap;

    cudaMalloc((void **)&device_bitmap, host_bitmap.image_size());

    // one single-thread block per pixel: an Nx x Ny grid
    dim3 grid(Nx, Ny);
    change_bitmap<<<grid,1>>>(device_bitmap);

    cudaMemcpy(host_bitmap.get_ptr(), device_bitmap, host_bitmap.image_size(), cudaMemcpyDeviceToHost);
    cudaFree(device_bitmap);

    host_bitmap.display_and_exit();
    return 0;
}
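Here the grid is two-dimensional (dim3 grid(Nx,Ny)), so every pixel gets its own single-thread block, and offset = x + y * gridDim.x linearizes the 2D position into the RGBA bitmap. Running one thread per block leaves most of each block's parallelism unused, though. As a sketch of an alternative (my own variant, not from the book), the kernel below assigns one thread per pixel using 16x16 thread blocks; note that offset must now use Nx, since gridDim.x no longer equals the image width:

// Sketch: one thread per pixel instead of one block per pixel.
__global__ void change_bitmap_threads( unsigned char *ptr ) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= Nx || y >= Ny) return;    // guard for partial edge blocks

    int offset = x + y * Nx;           // gridDim.x is no longer the width
    int juliaValue = julia( x, y );
    ptr[offset*4 + 0] = 255 * juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}

// dim3 threads(16, 16);
// dim3 blocks((Nx + 15) / 16, (Ny + 15) / 16);
// change_bitmap_threads<<<blocks, threads>>>(device_bitmap);

Since cpu_bitmap.h from the book's common code displays the image with OpenGL/GLUT, the program also has to be linked against those libraries; on my Linux setup something like nvcc julia.cu -o julia -lglut -lGL works (library names may differ on other platforms).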
Result: