1. Introduction
(1) Use CUDA to compute pow(sin(id),2) + pow(cos(id),2) (see the identity noted below)
(2) Compare the efficiency of a single stream (with synchronous and with asynchronous transfers), multi-stream depth-first scheduling, and multi-stream breadth-first scheduling, timing both data transfer and computation
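Note that by the Pythagorean identity

$$\sin^2(\mathrm{id}) + \cos^2(\mathrm{id}) = 1,$$

every element of the output array should be 1.0 (up to floating-point rounding), which makes the kernel's result trivial to verify.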
2. Core code
```cpp
/*
1. Use CUDA to compute pow(sin(id),2) + pow(cos(id),2)
2. Compare the efficiency of a single stream (synchronous transfer, asynchronous transfer),
   multi-stream depth-first scheduling, and multi-stream breadth-first scheduling
   (including data transfer and computation)
3. Use error-checking macros around the API calls
*/
#include <stdio.h>
#include <stdlib.h>

#define CUDA_ERROR_CHECK            // toggles API error checking
#define BLOCKSIZE 256

int N = 1 << 28;                    // number of elements
size_t NBytes = N * sizeof(float);  // number of bytes

// Macro that checks whether a CUDA API call returned an error
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError_t err, const char* file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSafeCall failed at %s:%d :(%d) %s\n",
                file, line, err, cudaGetErrorString(err));
        exit(-1);
    }
#endif
}

// Macro that checks for execution errors in a stream, mainly after kernel launches
#define CudaCheckError() _cudaCheckError(__FILE__, __LINE__)
inline void _cudaCheckError(const char* file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaCheckError failed at %s:%d :(%d) %s\n",
                file, line, err, cudaGetErrorString(err));
        exit(-1);
    }
#endif
}

// Each thread writes sin^2(id) + cos^2(id) into its element, bounded by n
__global__ void kernel_func(float* arr, int offset, const int n)
{
    int id = offset + threadIdx.x + blockIdx.x * blockDim.x;
    if (id < n)
        arr[id] = powf(sinf(id), 2.0f) + powf(cosf(id), 2.0f);
}
// Single stream, pageable host memory, synchronous transfer
float gpu_base()
{
    // Allocate pageable host memory and device memory
    float *hostA, *deviceA;
    hostA = (float*)calloc(N, sizeof(float));
    CudaSafeCall(cudaMalloc((void**)&deviceA, NBytes));

    float gpuTime = 0.0f;
    cudaEvent_t start, end;
    CudaSafeCall(cudaEventCreate(&start));
    CudaSafeCall(cudaEventCreate(&end));

    CudaSafeCall(cudaEventRecord(start));
    // Synchronous host-to-device copy, then one kernel over the whole array
    CudaSafeCall(cudaMemcpy(deviceA, hostA, NBytes, cudaMemcpyHostToDevice));
    kernel_func<<<(N - 1) / BLOCKSIZE + 1, BLOCKSIZE>>>(deviceA, 0, N);
    CudaCheckError();
    CudaSafeCall(cudaEventRecord(end));
    CudaSafeCall(cudaEventSynchronize(end));
    CudaSafeCall(cudaEventElapsedTime(&gpuTime, start, end));
    CudaSafeCall(cudaEventDestroy(start));
    CudaSafeCall(cudaEventDestroy(end));

    CudaSafeCall(cudaMemcpy(hostA, deviceA, NBytes, cudaMemcpyDeviceToHost));
    printf("gpu_base: single stream, pageable memory, transfer + compute took %f ms\n", gpuTime);

    CudaSafeCall(cudaFree(deviceA));
    free(hostA);
    return gpuTime;
}
// Single stream, pinned (page-locked) host memory, asynchronous transfer
float gpu_base_pinMem()
{
    // Allocate pinned host memory and device memory
    float *hostA, *deviceA;
    CudaSafeCall(cudaMallocHost((void**)&hostA, NBytes));
    CudaSafeCall(cudaMalloc((void**)&deviceA, NBytes));

    float gpuTime = 0.0f;
    cudaEvent_t start, end;
    CudaSafeCall(cudaEventCreate(&start));
    CudaSafeCall(cudaEventCreate(&end));

    CudaSafeCall(cudaEventRecord(start));
    // Asynchronous host-to-device copy on the default stream (requires pinned memory)
    CudaSafeCall(cudaMemcpyAsync(deviceA, hostA, NBytes, cudaMemcpyHostToDevice, 0));
    kernel_func<<<(N - 1) / BLOCKSIZE + 1, BLOCKSIZE>>>(deviceA, 0, N);
    CudaCheckError();
    CudaSafeCall(cudaEventRecord(end));
    CudaSafeCall(cudaEventSynchronize(end));
    CudaSafeCall(cudaEventElapsedTime(&gpuTime, start, end));
    CudaSafeCall(cudaEventDestroy(start));
    CudaSafeCall(cudaEventDestroy(end));

    CudaSafeCall(cudaMemcpy(hostA, deviceA, NBytes, cudaMemcpyDeviceToHost));
    printf("gpu_base_pinMem: single stream, pinned memory, transfer + compute took %f ms\n", gpuTime);

    CudaSafeCall(cudaFreeHost(hostA));
    CudaSafeCall(cudaFree(deviceA));
    return gpuTime;
}
// Multi-stream, depth-first scheduling
float gpu_MStream_deep(int nStreams)
{
    // Asynchronous transfers require page-locked (pinned) host memory
    float *hostA, *deviceA;
    CudaSafeCall(cudaMallocHost((void**)&hostA, NBytes));
    CudaSafeCall(cudaMalloc((void**)&deviceA, NBytes));

    float gpuTime = 0.0f;
    cudaEvent_t start, end;
    cudaStream_t* streams = (cudaStream_t*)calloc(nStreams, sizeof(cudaStream_t));
    for (int i = 0; i < nStreams; i++) CudaSafeCall(cudaStreamCreate(streams + i));
    CudaSafeCall(cudaEventCreate(&start));
    CudaSafeCall(cudaEventCreate(&end));

    CudaSafeCall(cudaEventRecord(start));
    // Depth-first issue order: the H2D copy, kernel, and D2H copy of one chunk are all
    // queued on a stream before moving on to the next stream; across streams at most
    // one transfer and one kernel can run concurrently
    int nByStream = N / nStreams;   // number of elements handled by each stream
    for (int i = 0; i < nStreams; i++) {
        int offset = i * nByStream;
        CudaSafeCall(cudaMemcpyAsync(deviceA + offset, hostA + offset,
                                     nByStream * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
        kernel_func<<<(nByStream - 1) / BLOCKSIZE + 1, BLOCKSIZE, 0, streams[i]>>>(deviceA, offset, (i + 1) * nByStream);
        CudaCheckError();
        CudaSafeCall(cudaMemcpyAsync(hostA + offset, deviceA + offset,
                                     nByStream * sizeof(float), cudaMemcpyDeviceToHost, streams[i]));
    }
    for (int i = 0; i < nStreams; i++) CudaSafeCall(cudaStreamSynchronize(streams[i]));
    CudaSafeCall(cudaEventRecord(end));
    CudaSafeCall(cudaEventSynchronize(end));
    CudaSafeCall(cudaEventElapsedTime(&gpuTime, start, end));
    CudaSafeCall(cudaEventDestroy(start));
    CudaSafeCall(cudaEventDestroy(end));
    printf("gpu_MStream_deep: %d streams, depth-first scheduling, transfer + compute took %f ms\n",
           nStreams, gpuTime);

    for (int i = 0; i < nStreams; i++) CudaSafeCall(cudaStreamDestroy(streams[i]));
    CudaSafeCall(cudaFreeHost(hostA));
    CudaSafeCall(cudaFree(deviceA));
    free(streams);
    return gpuTime;
}
// Multi-stream, breadth-first scheduling
float gpu_MStream_wide(int nStreams)
{
    // Asynchronous transfers require page-locked (pinned) host memory
    float *hostA, *deviceA;
    CudaSafeCall(cudaMallocHost((void**)&hostA, NBytes));
    CudaSafeCall(cudaMalloc((void**)&deviceA, NBytes));

    float gpuTime = 0.0f;
    cudaEvent_t start, end;
    cudaStream_t* streams = (cudaStream_t*)calloc(nStreams, sizeof(cudaStream_t));
    for (int i = 0; i < nStreams; i++) CudaSafeCall(cudaStreamCreate(streams + i));
    CudaSafeCall(cudaEventCreate(&start));
    CudaSafeCall(cudaEventCreate(&end));

    CudaSafeCall(cudaEventRecord(start));
    // Breadth-first issue order: all H2D copies first, then all kernels, then all D2H
    // copies, so transfers and kernels from different streams can overlap
    int nByStream = N / nStreams;   // number of elements handled by each stream
    for (int i = 0; i < nStreams; i++) {
        int offset = i * nByStream;
        CudaSafeCall(cudaMemcpyAsync(deviceA + offset, hostA + offset,
                                     nByStream * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
    }
    for (int i = 0; i < nStreams; i++) {
        int offset = i * nByStream;
        kernel_func<<<(nByStream - 1) / BLOCKSIZE + 1, BLOCKSIZE, 0, streams[i]>>>(deviceA, offset, (i + 1) * nByStream);
        CudaCheckError();
    }
    for (int i = 0; i < nStreams; i++) {
        int offset = i * nByStream;
        CudaSafeCall(cudaMemcpyAsync(hostA + offset, deviceA + offset,
                                     nByStream * sizeof(float), cudaMemcpyDeviceToHost, streams[i]));
    }
    for (int i = 0; i < nStreams; i++) CudaSafeCall(cudaStreamSynchronize(streams[i]));
    CudaSafeCall(cudaEventRecord(end));
    CudaSafeCall(cudaEventSynchronize(end));
    CudaSafeCall(cudaEventElapsedTime(&gpuTime, start, end));
    CudaSafeCall(cudaEventDestroy(start));
    CudaSafeCall(cudaEventDestroy(end));
    printf("gpu_MStream_wide: %d streams, breadth-first scheduling, transfer + compute took %f ms\n",
           nStreams, gpuTime);

    for (int i = 0; i < nStreams; i++) CudaSafeCall(cudaStreamDestroy(streams[i]));
    CudaSafeCall(cudaFreeHost(hostA));
    CudaSafeCall(cudaFree(deviceA));
    free(streams);
    return gpuTime;
}
int main(int argc, char* argv[])
{
    int nStreams = (argc == 2) ? atoi(argv[1]) : 4;

    // Default single stream, pageable host memory, synchronous transfer
    float gpuTime1 = gpu_base();
    // Default single stream, pinned host memory, asynchronous transfer
    float gpuTime2 = gpu_base_pinMem();
    // Multi-stream depth-first scheduling, asynchronous transfer
    float gpuTime3 = gpu_MStream_deep(nStreams);
    // Multi-stream breadth-first scheduling, asynchronous transfer
    float gpuTime4 = gpu_MStream_wide(nStreams);

    printf("Speedup of single-stream asynchronous transfer over the default single-stream synchronous version: %f\n",
           gpuTime1 / gpuTime2);
    printf("Speedup of %d-stream depth-first scheduling over the default single-stream synchronous version: %f\n",
           nStreams, gpuTime1 / gpuTime3);
    printf("Speedup of %d-stream breadth-first scheduling over the default single-stream synchronous version: %f\n",
           nStreams, gpuTime1 / gpuTime4);
    return 0;
}
```
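Because the expected value of every element is 1.0, the result is easy to sanity-check on the host. Below is a minimal sketch; the helper name `verify_result` is made up for illustration and is not part of the listing above. It could be called on `hostA` right after any of the device-to-host copies.

```cpp
#include <math.h>

// Hypothetical helper, not in the original program: count result elements
// that deviate from the expected value sin^2 + cos^2 = 1.
int verify_result(const float* hostA, int n)
{
    int bad = 0;
    for (int i = 0; i < n; i++)
        if (fabsf(hostA[i] - 1.0f) > 1e-5f) bad++;
    return bad;   // 0 means every element is (approximately) 1.0
}
```

The benchmark itself can be compiled and run with something along the lines of `nvcc -O2 streams.cu -o streams && ./streams 8` (file and binary names are placeholders); the optional argument is the number of streams.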
3. Test results
Elapsed time of each test, and speedup over single-stream synchronous transfer and computation
| Test \ number of streams | 4 | 8 | 16 | 32 | 64 |
|---|---|---|---|---|---|
| Single stream, synchronous transfer (time, ms) | 312.43 | 298.41 | 305.48 | 306.54 | 310.75 |
| Single stream, asynchronous transfer (time ms / speedup) | 197.15 / 1.58 | 195.89 / 1.52 | 201.88 / 1.51 | 202.87 / 1.51 | 201.53 / 1.54 |
| Multi-stream, depth-first scheduling (time ms / speedup) | 151.04 / 2.06 | 129.95 / 2.29 | 131.49 / 2.32 | 123.08 / 2.49 | 126.48 / 2.45 |
| Multi-stream, breadth-first scheduling (time ms / speedup) | 149.29 / 2.09 | 129.6 / 2.3 | 134.55 / 2.27 | 122.82 / 2.49 | 126.42 / 2.45 |
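How much copy/compute overlap the multi-stream versions can actually achieve depends on the GPU, in particular on how many copy engines it has. A standalone sketch (independent of the benchmark above; device index 0 is assumed) that queries the relevant properties:

```cpp
#include <stdio.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // query device 0
    // asyncEngineCount: 1 means copies can overlap with kernels in one direction,
    // 2 means H2D and D2H copies can additionally overlap with each other
    printf("%s: asyncEngineCount = %d, concurrentKernels = %d\n",
           prop.name, prop.asyncEngineCount, prop.concurrentKernels);
    return 0;
}
```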