This post series contains my solutions for Professional CUDA C Programming by John Cheng, Max Grossman, and Ty McKercher.
Chapter 2: CUDA Programming Model
In this section, we will learn to:
- Write a CUDA program
- Execute kernel functions
- Organize CUDA threads with grids and blocks
- Measure GPU performance
A typical CUDA program follows this processing flow:
- Copy data from CPU memory to GPU memory
- Invoke kernels to operate on the data stored in GPU memory
- Copy data back from GPU memory to CPU memory
We will start by learning memory management and data movement between the host and device. The CUDA C analogues of the standard C memory functions are:
STANDARD C FUNCTIONS | CUDA C FUNCTIONS |
---|---|
memset | cudaMemset |
memcpy | cudaMemcpy |
malloc | cudaMalloc |
free | cudaFree |
The signature of `cudaMalloc` is:

```cuda
cudaError_t cudaMalloc(void **devPtr, size_t size);
```
The signature of `cudaMemcpy` is:

```cuda
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind);
```
The `cudaMemcpyKind` flag takes one of the following values (a usage sketch follows the list):
- cudaMemcpyHostToHost
- cudaMemcpyHostToDevice
- cudaMemcpyDeviceToHost
- cudaMemcpyDeviceToDevice
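As a minimal sketch of how these functions fit together (the buffer names and the element count `N` here are illustrative, not from the book):

```cuda
#include <cuda_runtime.h>
#include <stdlib.h>

int main(void)
{
    const int N = 1024;                      // element count (illustrative)
    size_t nBytes = N * sizeof(float);

    float *h_data = (float *)malloc(nBytes); // host buffer
    float *d_data;                           // device buffer
    cudaMalloc((void **)&d_data, nBytes);

    cudaMemcpy(d_data, h_data, nBytes, cudaMemcpyHostToDevice); // host -> device
    // ... launch kernels that operate on d_data here ...
    cudaMemcpy(h_data, d_data, nBytes, cudaMemcpyDeviceToHost); // device -> host

    cudaFree(d_data);
    free(h_data);
    return 0;
}
```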
So, how can we manage data movement between the host and device, and how do we program in CUDA C? Let's work through a simple example of summing two arrays.
First, I start with a simple C program without the GPU.
```c
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <stdio.h>

void sumArrays(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

void storeData(float *arr, int size)
{
    time_t t;
    srand((unsigned int) time(&t));
    for (int i = 0; i < size; i++)
    {
        arr[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

int main()
{
    int numbersofelements = 1024;
    size_t size = numbersofelements * sizeof(float);
    float *A, *B, *C;
    A = (float *)malloc(size);
    B = (float *)malloc(size);
    C = (float *)malloc(size);

    storeData(A, numbersofelements);
    storeData(B, numbersofelements);

    sumArrays(A, B, C, numbersofelements);

    free(A);
    free(B);
    free(C);
    return 0;
}
```
I then changed this code for GPU computation:
```cuda
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <stdio.h>

__global__ void sumArrays(float *a, float *b, float *c, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
    {
        c[i] = a[i] + b[i];
    }
}

void storeData(float *arr, int size)
{
    time_t t;
    srand((unsigned int) time(&t));
    for (int i = 0; i < size; i++)
    {
        arr[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

int main()
{
    int numbersofelements = 10;
    size_t size = numbersofelements * sizeof(float);
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    storeData(h_A, numbersofelements);
    storeData(h_B, numbersofelements);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyDeviceToHost);

    sumArrays<<<1, numbersofelements>>>(d_A, d_B, d_C, numbersofelements);
    cudaDeviceSynchronize();

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < numbersofelements; i++)
    {
        printf("%lf\n", h_C[i]);
    }
    for (int i = 0; i < numbersofelements; i++)
    {
        printf("%lf\n", h_A[i]);
    }
    for (int i = 0; i < numbersofelements; i++)
    {
        printf("%lf\n", h_B[i]);
    }

    //sumArrays(A,B,C,numbersofelements);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    cudaDeviceReset();
    return 0;
}
```
Organizing Threads
In this section, we will learn to use CUDA grids, blocks, and threads. All threads spawned by a single kernel launch are collectively called a grid, and all threads in a grid share the same global memory space. A grid is made up of many thread blocks; a thread block is a group of threads that can cooperate with each other through block-local synchronization and shared memory. A thread is identified by the built-in variables blockIdx and threadIdx, each of which has three components:
- blockIdx.x, blockIdx.y, blockIdx.z
- threadIdx.x, threadIdx.y, threadIdx.z
The dimensions of each block and of the grid are available through blockDim and gridDim, which have the same three components.
We can use these variables in the example code below:
```cuda
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void printIndex(void)
{
    printf("threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) blockDim:(%d, %d, %d) "
           "gridDim:(%d, %d, %d)\n",
           threadIdx.x, threadIdx.y, threadIdx.z,
           blockIdx.x, blockIdx.y, blockIdx.z,
           blockDim.x, blockDim.y, blockDim.z,
           gridDim.x, gridDim.y, gridDim.z);
}

int main()
{
    int NofEl = 10;
    dim3 block(5);
    dim3 grid((NofEl + block.x - 1) / block.x);
    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);
    printIndex<<<grid, block>>>();
    cudaDeviceReset();
    return 0;
}
```
The takeaway is that choosing the grid and block sizes is important; a short sketch of the usual sizing idiom follows.
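A common idiom (a minimal sketch, with an illustrative element count `N`) is to pick a block size first and derive the grid size with a ceiling division, so the grid covers all `N` elements even when `N` is not a multiple of the block size:

```cuda
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    int N = 1 << 20;                         // element count (illustrative)
    dim3 block(256);                         // threads per block, chosen by hand
    dim3 grid((N + block.x - 1) / block.x);  // ceiling division: enough blocks to cover N
    printf("grid.x = %d, block.x = %d\n", grid.x, block.x);
    // kernels must still guard with `if (i < N)` because the last block may overhang
    return 0;
}
```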
Launch a CUDA Kernel
The CUDA syntax for launching a kernel looks like this:
```cuda
// ex: /home/cemo/Desktop/Learn_CUDA/My_Solutions/Chapter2/vectorsum.cu
kernel_function_name<<<grid, block>>>(arguments);
```
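A kernel launch is asynchronous and returns no error code itself, so mistakes in the launch only surface later. Here is a minimal sketch of catching them with the runtime calls `cudaGetLastError` and `cudaDeviceSynchronize` (the kernel name is a placeholder):

```cuda
kernel_function_name<<<grid, block>>>(arguments);
cudaError_t err = cudaGetLastError();   // errors from the launch configuration
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();      // errors raised while the kernel ran
if (err != cudaSuccess)
    printf("CUDA error: %s\n", cudaGetErrorString(err));
```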
You may have noticed that the earlier GPU array-summation example is wrong. We will now write it again correctly; see if you can find the error in the first version.
```cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>

__global__ void sumArraysOnGpu(float *A, float *B, float *C, const int N)
{
    int i = threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

void sumArrays(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

void storeData(float *arr, int size)
{
    time_t t;
    srand((unsigned) time(&t));
    for (int i = 0; i < size; i++)
    {
        arr[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
}

int main(int argc, char const *argv[])
{
    int DeviceID = 0;
    int numberElement = 32;
    cudaSetDevice(DeviceID);

    size_t numberBytes = numberElement * sizeof(float);
    float *host_A, *host_B, *cpu_Result, *gpu_Result;
    host_A = (float *)malloc(numberBytes);
    host_B = (float *)malloc(numberBytes);
    cpu_Result = (float *)malloc(numberBytes);
    gpu_Result = (float *)malloc(numberBytes);

    storeData(host_A, numberElement);
    storeData(host_B, numberElement);
    memset(cpu_Result, 0, numberBytes);
    memset(gpu_Result, 0, numberBytes);

    float *device_A, *device_B, *device_Result;
    cudaMalloc((float **)&device_A, numberBytes);
    cudaMalloc((float **)&device_B, numberBytes);
    cudaMalloc((float **)&device_Result, numberBytes);

    // this time the copies to the device use cudaMemcpyHostToDevice
    cudaMemcpy(device_A, host_A, numberBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_B, host_B, numberBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_Result, gpu_Result, numberBytes, cudaMemcpyHostToDevice);

    dim3 block(numberElement);
    dim3 grid(numberElement / block.x);
    printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x);

    sumArraysOnGpu<<<grid, block>>>(device_A, device_B, device_Result, numberElement);
    cudaMemcpy(gpu_Result, device_Result, numberBytes, cudaMemcpyDeviceToHost);

    sumArrays(host_A, host_B, cpu_Result, numberElement);
    checkResult(cpu_Result, gpu_Result, numberElement);

    cudaFree(device_A);
    cudaFree(device_B);
    cudaFree(device_Result);
    free(host_A);
    free(host_B);
    free(cpu_Result);
    free(gpu_Result);
    cudaDeviceReset();
    return 0;
}
```
`nvprof` shows the timing of kernels and API calls (for example, `nvprof ./a.out`).
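You can also time a kernel from inside the program with CUDA events. A minimal sketch, reusing the names from the listing above (the kernel and launch configuration are placeholders):

```cuda
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
sumArraysOnGpu<<<grid, block>>>(device_A, device_B, device_Result, numberElement);
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait until the kernel has finished

float milliseconds = 0.0f;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Kernel time: %f ms\n", milliseconds);

cudaEventDestroy(start);
cudaEventDestroy(stop);
```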
Parallel Threads
In this section, we will implement matrix summation with multidimensional grids and blocks:
- 2D grid with 2D blocks
- 1D grid with 1D blocks
- 2D grid with 1D blocks
You can compute the thread indices with the code below:

```cuda
// global thread coordinates in a 2D grid of 2D blocks
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
```
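To address a matrix stored row-major in a flat array, these two coordinates are combined into a single linear offset; this is exactly what the kernel below does:

```cuda
// row-major flattening: nx is the number of columns in the matrix
unsigned int idx = iy * nx + ix;
```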
First, we implement matrix summation with a 2D grid and 2D blocks:
```cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>

#define CHECK(call)                                                  \
{                                                                    \
    const cudaError_t error = call;                                  \
    if (error != cudaSuccess)                                        \
    {                                                                \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);       \
        fprintf(stderr, "code: %d, reason: %s\n", error,             \
                cudaGetErrorString(error));                          \
    }                                                                \
}

void sumMatrixOnCPU(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;
    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx;
        ib += nx;
        ic += nx;
    }
    return;
}

void store_data(float *arr, const int size)
{
    for (int i = 0; i < size; i++)
    {
        arr[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("host %f gpu %f index %d \n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match)
        printf("Arrays match.\n\n");
    else
        printf("Arrays do not match.\n\n");
}

inline double calculate_seconds()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

__global__ void matrix_sum(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}

int main(int argc, char const *argv[])
{
    printf("%s Program is starting\n", argv[0]);
    int DeviceID = 0;
    cudaDeviceProp deviceProp;

    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    size_t nBytes = nxy * sizeof(float);
    double iStart, iEnd;

    float *host_A, *host_B, *cpu_result, *gpu_result;
    float *device_A, *device_B, *device_result;

    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    host_A = (float *)malloc(nBytes);
    host_B = (float *)malloc(nBytes);
    gpu_result = (float *)malloc(nBytes);
    cpu_result = (float *)malloc(nBytes);
    memset(cpu_result, 0, nBytes);
    memset(gpu_result, 0, nBytes);

    iStart = calculate_seconds();
    store_data(host_A, nxy);
    store_data(host_B, nxy);
    iEnd = calculate_seconds() - iStart;
    printf("Time of store_data %lf\n", iEnd);

    iStart = calculate_seconds();
    sumMatrixOnCPU(host_A, host_B, cpu_result, nx, ny);
    iEnd = calculate_seconds() - iStart;
    printf("Time of sumMatrixOnCPU %f \n", iEnd);

    CHECK(cudaGetDeviceProperties(&deviceProp, DeviceID));
    CHECK(cudaSetDevice(DeviceID));
    printf("Using Device %d: %s\n", DeviceID, deviceProp.name);

    CHECK(cudaMalloc((void **)&device_A, nBytes));
    CHECK(cudaMalloc((void **)&device_B, nBytes));
    CHECK(cudaMalloc((void **)&device_result, nBytes));

    CHECK(cudaMemcpy(device_A, host_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(device_B, host_B, nBytes, cudaMemcpyHostToDevice));

    iStart = calculate_seconds();
    matrix_sum<<<grid, block>>>(device_A, device_B, device_result, nx, ny);
    CHECK(cudaDeviceSynchronize());
    iEnd = calculate_seconds() - iStart;
    printf("Time of matrix_sum %lf\n", iEnd);

    CHECK(cudaMemcpy(gpu_result, device_result, nBytes, cudaMemcpyDeviceToHost));
    checkResult(cpu_result, gpu_result, nxy);

    CHECK(cudaFree(device_A));
    CHECK(cudaFree(device_B));
    CHECK(cudaFree(device_result));
    free(host_A);
    free(host_B);
    free(cpu_result);
    free(gpu_result);
    CHECK(cudaDeviceReset());
    return 0;
}
```
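To build and run this, something like `nvcc -O2 matrix_sum.cu -o matrix_sum && ./matrix_sum` should work (the file name is a placeholder; on a Jetson Xavier you may also want an architecture flag such as `-arch=sm_72`).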
Manage Devices
In this section, we will query information about the devices from C code:
```cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define CHECK(call)                                                  \
{                                                                    \
    const cudaError_t error = call;                                  \
    if (error != cudaSuccess)                                        \
    {                                                                \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);       \
        fprintf(stderr, "code: %d, reason: %s\n", error,             \
                cudaGetErrorString(error));                          \
    }                                                                \
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev = 0, driverVersion = 0, runtimeVersion = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Device %d: \"%s\"\n", dev, deviceProp.name);

    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
           deviceProp.major, deviceProp.minor);
    printf("  Total amount of global memory:                 %.2f GBytes (%llu "
           "bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3),
           (unsigned long long)deviceProp.totalGlobalMem);
    printf("  GPU Clock rate:                                %.0f MHz (%0.2f "
           "GHz)\n", deviceProp.clockRate * 1e-3f,
           deviceProp.clockRate * 1e-6f);
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf("  Memory Bus Width:                              %d-bit\n",
           deviceProp.memoryBusWidth);
    if (deviceProp.l2CacheSize)
    {
        printf("  L2 Cache Size:                                 %d bytes\n",
               deviceProp.l2CacheSize);
    }
    printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), "
           "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
           deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
           deviceProp.maxTexture3D[2]);
    printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, "
           "2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
           deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
           deviceProp.maxTexture2DLayered[1],
           deviceProp.maxTexture2DLayered[2]);
    printf("  Total amount of constant memory:               %lu bytes\n",
           deviceProp.totalConstMem);
    printf("  Total amount of shared memory per block:       %lu bytes\n",
           deviceProp.sharedMemPerBlock);
    printf("  Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf("  Warp size:                                     %d\n",
           deviceProp.warpSize);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of threads per block:           %d\n",
           deviceProp.maxThreadsPerBlock);
    printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch:                          %lu bytes\n",
           deviceProp.memPitch);

    exit(EXIT_SUCCESS);
}
```
I solved the examples of Chapter 2; you can find my solutions below. I used a Jetson Xavier.
Example 1
I modified sumArraysOnGPU-timer.cu, changing the block.x variable from 1024 to 1023. The results are below.
```
./a.out Starting...
Using Device 0: Xavier
Vector size 16777216
initialData Time elapsed 1.336892 sec
sumArraysOnHost Time elapsed 0.057787 sec
sumArraysOnGPU <<< 16384, 1024 >>> Time elapsed 0.012352 sec
Arrays match.

//////////////////////////////////////////////////////////////

./a.out Starting...
Using Device 0: Xavier
Vector size 16777216
initialData Time elapsed 1.329670 sec
sumArraysOnHost Time elapsed 0.057049 sec
sumArraysOnGPU <<< 16401, 1023 >>> Time elapsed 0.012248 sec
Arrays match.
```
Example 2
I modified sumArraysOnGPU-timer.cu again, changing block.x to 256 and letting each thread sum two elements. The results and the new kernel code are below.
```cuda
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int nthreads = gridDim.x * blockDim.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = i + nthreads;
    if (i < N) C[i] = A[i] + B[i];
    if (j < N) C[j] = A[j] + B[j];
}

// dim3 grid must be changed from
dim3 grid((nElem + block.x - 1) / block.x);
// to
dim3 grid(((nElem / 2) + block.x - 1) / block.x);
```
Results
```
./a.out Starting...
Using Device 0: Xavier
Vector size 16777216
initialData Time elapsed 1.311135 sec
sumArraysOnHost Time elapsed 0.057915 sec
sumArraysOnGPU <<< 32768, 256 >>> Time elapsed 0.008780 sec
Arrays match.
```
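The two-elements-per-thread kernel above generalizes naturally to a grid-stride loop, where each thread walks through the array in steps of the total thread count. This is not the book's solution, just a sketch of the general pattern:

```cuda
__global__ void sumArraysGridStride(float *A, float *B, float *C, const int N)
{
    // total number of threads launched across the whole grid
    int stride = gridDim.x * blockDim.x;
    // each thread starts at its global index and hops forward by the grid size
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
        C[i] = A[i] + B[i];
}
```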
Example 3
I modified sumMatrixOnGPU-2D-grid-2D-block.cu, changing the block.x and block.y variables. The results and the code are below.
Block.x | Block.y | Kernel Time |
---|---|---|
1024 | 1 | 0.036771 s |
512 | 1 | 0.035524 s |
128 | 2 | 0.037406 s |
128 | 8 | 0.034023 s |
32 | 32 | 0.034817 s |
```cuda
#include "common.h"
#include <cuda_runtime.h>
#include <stdio.h>

void initialData(int *ip, const int size)
{
    int i;
    for (i = 0; i < size; i++)
    {
        ip[i] = (int)(rand() & 0xFF);
    }
    return;
}

void sumMatrixOnHost(int *A, int *B, int *C, const int nx, const int ny)
{
    int *ia = A;
    int *ib = B;
    int *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx;
        ib += nx;
        ic += nx;
    }
    return;
}

void checkResult(int *hostRef, int *gpuRef, const int N)
{
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (hostRef[i] != gpuRef[i])
        {
            match = 0;
            printf("host %d gpu %d\n", hostRef[i], gpuRef[i]);
            break;
        }
    }
    if (match)
        printf("Arrays match.\n\n");
    else
        printf("Arrays do not match.\n\n");
}

// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(int *MatA, int *MatB, int *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx = 1 << 13;
    int ny = 1 << 13;
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(int);
    printf("Matrix size: nx %d ny %d\n", nx, ny);

    // malloc host memory
    int *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (int *)malloc(nBytes);
    h_B = (int *)malloc(nBytes);
    hostRef = (int *)malloc(nBytes);
    gpuRef = (int *)malloc(nBytes);

    // initialize data at host side
    double iStart = seconds();
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    double iElaps = seconds() - iStart;
    printf("Matrix initialization elapsed %f sec\n", iElaps);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    iStart = seconds();
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    iElaps = seconds() - iStart;
    printf("sumMatrixOnHost elapsed %f sec\n", iElaps);

    // malloc device global memory
    int *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    iStart = seconds();
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x,
           grid.y, block.x, block.y, iElaps);

    // check kernel error
    CHECK(cudaGetLastError());

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return (0);
}
```
Example 4
I modified sumMatrixOnGPU-2D-grid-1D-block.cu, changing the block.x variable. The results and the code are below.
Block.x | Kernel Time (s) |
---|---|
32 | 0.333858 s |
64 | 0.167834 s |
128 | 0.111178 s |
256 | 0.115275 s |
512 | 0.112036 s |
1024 | 0.139362 s |
The best result comes from a block size of 128.
```cuda
#include "common.h"
#include <cuda_runtime.h>
#include <stdio.h>

void initialData(float *ip, const int size)
{
    int i;
    for (i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx;
        ib += nx;
        ic += nx;
    }
    return;
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
            break;
        }
    }
    if (match)
        printf("Arrays match.\n\n");
    else
        printf("Arrays do not match.\n\n");
}

// grid 2D block 1D
__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = blockIdx.y;
    unsigned int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size: nx %d ny %d\n", nx, ny);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    // initialize data at host side
    double iStart = seconds();
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    double iElaps = seconds() - iStart;
    printf("Matrix initialization elapsed %f sec\n", iElaps);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    iStart = seconds();
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    iElaps = seconds() - iStart;
    printf("sumMatrixOnHost elapsed %f sec\n", iElaps);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    int dimx = 1024;
    dim3 block(dimx, 1);
    dim3 grid((nx + block.x - 1) / block.x, ny);

    iStart = seconds();
    sumMatrixOnGPUMix<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumMatrixOnGPUMix <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x,
           grid.y, block.x, block.y, iElaps);

    // check kernel error
    CHECK(cudaGetLastError());

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return (0);
}
```