#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstdio>
#include <ctime>
#include <cmath>
#include <windows.h>

using std::cout;
using std::endl;

typedef float calc_type;

// Fill an array with random values in [0, 1].
void randomize(calc_type *array, int len)
{
    for (int i = 0; i < len; i++)
        array[i] = rand() / ((calc_type)RAND_MAX);
}

// Reference element-wise addition on the CPU.
void cpuAddition(calc_type *a, calc_type *b, calc_type *c, int len)
{
    for (int i = 0; i < len; i++)
        c[i] = a[i] + b[i];
}

// Element-wise addition on the GPU: one thread per element.
__global__ void gpuAddition(calc_type *a, calc_type *b, calc_type *c)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    c[idx] = a[idx] + b[idx];
}

__global__ void helloWorldMulti()
{
    printf("Hello World from gpu thread %d block %d\n", threadIdx.x, blockIdx.x);
}

const int arrSize = 1024 * 1024 * 128;

int main(int argc, char **argv)
{
    srand((unsigned)time(NULL));

    // Query and print basic properties of device 0.
    cudaDeviceProp device;
    cudaGetDeviceProperties(&device, 0);
    int driver, runtime;
    cudaDriverGetVersion(&driver);
    cudaRuntimeGetVersion(&runtime);

    cout << "Device : \"" << device.name << "\"" << endl;
    cout << "  CUDA Driver Version          : " << driver / 1000 << "." << (driver % 100) / 10 << endl;
    cout << "  CUDA Runtime Version         : " << runtime / 1000 << "." << (runtime % 100) / 10 << endl;
    cout << "  Device CUDA Capability       : " << device.major << "." << device.minor << endl;
    cout << "  Memory                       : " << (float)device.totalGlobalMem / pow(1024.0, 3) << " GigaByte(s)" << endl;
    cout << "  Constant Memory              : " << device.totalConstMem << " Byte(s)" << endl;
    cout << "  L2 Cache Size                : " << device.l2CacheSize << " Byte(s)" << endl;
    cout << "  GPU Clock Rate               : " << device.clockRate / 1000 << " MHz" << endl;
    cout << "  Memory Clock Rate            : " << device.memoryClockRate / 1000 << " MHz" << endl;
    cout << "  Memory Bus Width             : " << device.memoryBusWidth << "-bit" << endl;
    cout << "  Shared Memory per Block      : " << device.sharedMemPerBlock << " Byte(s)" << endl;
    cout << "  Warp Size                    : " << device.warpSize << endl;
    cout << "  Maximum threads per block    : " << device.maxThreadsPerBlock << endl;
    cout << "  Maximum Dimensions of block  : (" << device.maxThreadsDim[0] << ", " << device.maxThreadsDim[1] << ", " << device.maxThreadsDim[2] << ")" << endl;
    cout << "  Maximum Dimensions of grid   : (" << device.maxGridSize[0] << ", " << device.maxGridSize[1] << ", " << device.maxGridSize[2] << ")" << endl;

    // Launch a tiny kernel: 2 blocks of 2 threads, each prints its indices.
    cout << endl << endl << "Multi-thread tests :" << endl;
    helloWorldMulti<<<2, 2>>>();
    cudaDeviceSynchronize();
    getchar();

    // Element-wise addition: compare the CPU result against the GPU result.
    cout << endl << "Addition tests : " << endl;
    calc_type *a, *b, *c;
    a = (calc_type *)malloc(sizeof(calc_type) * arrSize);
    b = (calc_type *)malloc(sizeof(calc_type) * arrSize);
    c = (calc_type *)malloc(sizeof(calc_type) * arrSize);
    randomize(a, arrSize);
    randomize(b, arrSize);

    calc_type *ga, *gb, *gc, *gr;
    gr = (calc_type *)malloc(sizeof(calc_type) * arrSize);
    cudaMalloc(&ga, sizeof(calc_type) * arrSize);
    cudaMalloc(&gb, sizeof(calc_type) * arrSize);
    cudaMalloc(&gc, sizeof(calc_type) * arrSize);

    DWORD c1, c2, g1, g2;

    // Time the CPU version.
    c1 = GetTickCount();
    cpuAddition(a, b, c, arrSize);
    c2 = GetTickCount();

    // Time the GPU version, including the host-to-device copies.
    g1 = GetTickCount();
    cudaMemcpy(ga, a, sizeof(calc_type) * arrSize, cudaMemcpyHostToDevice);
    cudaMemcpy(gb, b, sizeof(calc_type) * arrSize, cudaMemcpyHostToDevice);
    gpuAddition<<<arrSize / 1024, 1024>>>(ga, gb, gc);
    cudaDeviceSynchronize();
    g2 = GetTickCount();

    cudaMemcpy(gr, gc, sizeof(calc_type) * arrSize, cudaMemcpyDeviceToHost);
    cudaFree(ga);
    cudaFree(gb);
    cudaFree(gc);

    // Verify the GPU result against the CPU reference.
    int errors = 0;
    for (int i = 0; i < arrSize; i++)
        if (fabs(gr[i] - c[i]) > 1e-6) {
            errors++;
            cout << gr[i] << ' ' << c[i] << endl;
        }
    cout << errors << " Error(s) found" << endl;

    cout << "CPU Time Consumption : " << c1 << " -> " << c2 << " = " << (c2 - c1) << " ms" << endl;
    cout << "GPU Time Consumption : " << g1 << " -> " << g2 << " = " << (g2 - g1) << " ms" << endl;

    free(a);
    free(b);
    free(c);
    free(gr);
    cudaDeviceReset();
    return 0;
}
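// --- Sketch, not part of the original listing: minimal CUDA error checking. ---
// The listing above ignores every return code, so a failed cudaMalloc or a bad
// kernel launch only shows up later as wrong results. A small helper like the
// (hypothetical) checkCuda below, which relies on the headers already included
// at the top of the file, reports the failure immediately with a readable message.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
// Example usage:
//   checkCuda(cudaMalloc(&ga, sizeof(calc_type) * arrSize), "cudaMalloc(ga)");
//   gpuAddition<<<arrSize / 1024, 1024>>>(ga, gb, gc);
//   checkCuda(cudaGetLastError(), "gpuAddition launch");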
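// --- Sketch, not part of the original listing: kernel-only timing with CUDA events. ---
// GetTickCount has coarse (roughly 10-16 ms) resolution, and the GPU measurement
// above also includes the two host-to-device copies. The (hypothetical) helper
// below times only the gpuAddition kernel; it assumes ga, gb and gc are device
// buffers of arrSize elements, allocated and filled as in main() above.
static float kernelTimeMs(calc_type *ga, calc_type *gb, calc_type *gc)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);                          // mark start on the default stream
    gpuAddition<<<arrSize / 1024, 1024>>>(ga, gb, gc);
    cudaEventRecord(stop);                           // mark end after the kernel
    cudaEventSynchronize(stop);                      // wait for the kernel to finish

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);          // elapsed GPU time in milliseconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}
// Example usage (after the two host-to-device cudaMemcpy calls in main):
//   cout << "Kernel-only time : " << kernelTimeMs(ga, gb, gc) << " ms" << endl;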