1. 目标:对数组进行求和,并做优化对比
2. baseline 代码
相邻求和: 根据blockSize对数据分块,并将数据放在共享内存,以线程块为单位,块内线程数量=数据个数,相邻配对,用其中第一个元素索引为ID的线程进行计算,计算结果放在第一个元素位置,循环进行下一轮计算,最后块求和计算结果赋值到全局内存以blockIdx.x为id的数组中,拷贝到主机对数组求和得到所有数字之和。
#include <stdio.h>
#include "common.h"#define BLOCKSIZE 512__global__ void kernel_reduce(float* in,long N,float* out,int gridSize)
{//每个线程块加载与线程数量相同的数据__shared__ float arrShared[BLOCKSIZE];int tid = threadIdx.x;//共享内存数据初始化arrShared[tid] = in[blockIdx.x*blockDim.x + tid];__syncthreads();//线程块内计算for(long s=1;s<=BLOCKSIZE/2;s*=2){if(0 == tid % (2*s))arrShared[tid] += arrShared[tid+s];__syncthreads();}if(tid ==0)out[blockIdx.x] = arrShared[0];
}int main(int argc, char ** argv)
{long N =1<<10;if(argc > 1)N = 1<<atoi(argv[1]);N *= 32;//GPU计算参数float gpuTime = 0;int gridSize = (N-1)/BLOCKSIZE+1; //数组初始化printf("N %ld ,GridSize %d\n",N,gridSize);//为了保证对比准确性,最后求和不计入耗时对比float cpuTime = 0;float* arrHost = (float*)calloc(N,sizeof(float));float* arrResult = (float*)calloc(gridSize,sizeof(float));float resultHost = 0;initialData(arrHost,N);double start = cpuSecond();for(int i=0;i<N/BLOCKSIZE;i++){float temp =0;for(int j=0;j<BLOCKSIZE;j++)temp += arrHost[i*BLOCKSIZE+j];arrResult[i] = temp;}for(int i=0;i<N%BLOCKSIZE;i++)arrResult[gridSize-1] += arrHost[N/BLOCKSIZE*BLOCKSIZE + i];double end = cpuSecond();cpuTime = (end - start)*1000;//gpufloat *arrD = NULL;float *resultD = NULL;float *resultFromD = NULL;float resultGpu = 0;CudaSafeCall(cudaMalloc((void**)&arrD,N*sizeof(float)));CudaSafeCall(cudaMalloc((void**)&resultD,gridSize*sizeof(float)));resultFromD = (float*)calloc(gridSize,sizeof(float));cudaEvent_t startD;cudaEvent_t endD;CudaSafeCall(cudaEventCreate(&startD));CudaSafeCall(cudaEventCreate(&endD));CudaSafeCall(cudaEventRecord(startD));CudaSafeCall(cudaMemcpy(arrD,arrHost,N*sizeof(float),cudaMemcpyHostToDevice));kernel_reduce<<<gridSize,BLOCKSIZE,sizeof(float)*BLOCKSIZE>>>(arrD,N,resultD,gridSize);CudaCheckError();CudaSafeCall(cudaMemcpy(resultFromD,resultD,gridSize*sizeof(float),cudaMemcpyDeviceToHost));CudaSafeCall(cudaEventRecord(endD));cudaEventSynchronize(endD);CudaSafeCall(cudaEventElapsedTime(&gpuTime,startD,endD));CudaSafeCall(cudaEventDestroy(startD));CudaSafeCall(cudaEventDestroy(endD));//汇总求和for(int i=0;i<gridSize;i++){resultHost += arrResult[i];resultGpu += resultFromD[i];}printf("数据量 %ld ;串行结算结果为%.3f,耗时 %.3f ms;GPU计算结果为%.3f,耗时 %.3f ms;加速比为%.3f\n",N,resultHost,cpuTime,resultGpu,gpuTime,cpuTime/gpuTime);CudaSafeCall(cudaFree(arrD));CudaSafeCall(cudaFree(resultD));free(arrHost);free(resultFromD);return 0;
}
3. 优化代码及思路
优化一:从全局内存加载时计算,一个线程块计算8个线程块的数据
优化二:从全局内存加载数据时使用合并访问,一个线程处理的数据索引步长大小为 blocksize 大小,
这样处理的话在一个线程束中访存可以对 cacheline 连续命中
优化三:线程束内做循环展开
优化四:用 shfl API 线程束内访问其他线程寄存器
#include <stdio.h>
#include "common.h"#define BLOCK_SIZE 256
#define NPerThread 8 //每个线程初始化数据个数
#define WARP_SIZE 32 //线程束大小
// #define warpNum 1024/WARP_SIZE//优化三:线程束内的规约,并做循环展开
//优化四:使用 shfl API 可以在warp内跨线程访问寄存器数据
template <int blockSize>
__device__ float warp_reduce(float sum)
// __device__ float warpReduceSum(float sum)
{//防止线程块大小设置太小,做判断if(blockSize>=32)//向前面的线程传递寄存器数值sum += __shfl_down_sync(0xffffffff,sum,16);if(blockSize>=16) sum += __shfl_down_sync(0xffffffff,sum,8);if(blockSize>=8) sum += __shfl_down_sync(0xffffffff,sum,4);if(blockSize>=4) sum += __shfl_down_sync(0xffffffff,sum,2);if(blockSize>=2) sum += __shfl_down_sync(0xffffffff,sum,1);return sum;
}//线程块大小 每个线程要处理的个数
template <int blockSize, int NUM_PER_THREAD>
__global__ void block_reduce(float* g_in,float* g_out)
{float sum = 0; //保存当前线程加和的数值int tid = threadIdx.x;#if 0int tempId1 = blockIdx.x*blockDim.x*NUM_PER_THREAD;int tempId2 = tid*NUM_PER_THREAD; //优化:从全局内存取数据时,每个线程取多个数据//共享内存数据初始化,循环展开#pragma unroolfor(int i=0;i<NUM_PER_THREAD;i++) sum += g_in[tempId1 + tempId2 + i];#else //优化二:全局内存合并访问增加线程束一次访存的缓存命中,//在 NUM_PER_THREAD 个线程块相同位置分别取数据给线程求和//而不是对一个块中连续内存访问求和int temp1 = blockIdx.x * blockSize * NUM_PER_THREAD + tid;#pragma unroolfor(int i=0;i<NUM_PER_THREAD;i++)//优化一:加载时计算sum += g_in[temp1 + i * blockSize];#endif//因为线程块最大1024个线程,所以线程束规约结果数量为 1024/WARP_SIZE = 32__shared__ float arrWrap[WARP_SIZE];//做warp内规约//使用 shfl 接口,传入要从其他线程复制的寄存器变量 sumsum = warp_reduce<blockSize>(sum);//wrap内 0号线程将结果保存到共享内存if(tid % WARP_SIZE == 0)arrWrap[tid/WARP_SIZE] = sum;__syncthreads();//将warp规约结果放入第一个warp内sum = (tid<WARP_SIZE) ? arrWrap[tid] : 0;//对第一个warp规约if((tid / WARP_SIZE) == 0)sum = warp_reduce<blockSize/WARP_SIZE>(sum);//保存整个线程块规约结果if(tid == 0)g_out[blockIdx.x] = sum;
}int main(int argc, char **argv)
{printf("0\n");int N = 1<<10;if(argc > 1)N = 1<<atoi(argv[1]);N *= 32;//GPU参数计算float gpuTime = 0;printf("1\n");int gridSize = (N-1)/(BLOCK_SIZE*NPerThread)+1; // int gridSize = (N-1)/BLOCK_SIZE + 1;printf("N %ld ,GridSize %d\n",N,gridSize);//cpufloat cpuTime = 0;float* arrHost = (float*)calloc(N,sizeof(float));float* arrResult = (float*)calloc(gridSize,sizeof(float));float resultHost = 0;initialData(arrHost,N);double start = cpuSecond();int number = BLOCK_SIZE*NPerThread;for(int i=0;i < N/number;i++){float temp = 0;for(int j=0;j<number;j++)temp += arrHost[i*number+j];arrResult[i] = temp;}for(int i=0;i < N%(BLOCK_SIZE*NPerThread);i++)arrResult[gridSize-1] += arrHost[N/number*number + i];double end = cpuSecond();cpuTime = (end - start)*1000;//gpufloat *arrD = NULL;float *resultD = NULL;float *resultFromD = NULL;float resultGpu = 0;CudaSafeCall(cudaMalloc((void**)&arrD,N*sizeof(float)));CudaSafeCall(cudaMalloc((void**)&resultD,gridSize*sizeof(float)));resultFromD = (float*)calloc(gridSize,sizeof(float));cudaEvent_t startD;cudaEvent_t endD;CudaSafeCall(cudaEventCreate(&startD));CudaSafeCall(cudaEventCreate(&endD));CudaSafeCall(cudaMemcpy(arrD,arrHost,N*sizeof(float),cudaMemcpyHostToDevice));CudaSafeCall(cudaEventRecord(startD,0));block_reduce<BLOCK_SIZE,NPerThread><<<gridSize,BLOCK_SIZE,sizeof(float)*WARP_SIZE>>>(arrD,resultD);CudaCheckError();CudaSafeCall(cudaEventRecord(endD,0));CudaSafeCall(cudaMemcpy(resultFromD,resultD,gridSize*sizeof(float),cudaMemcpyDeviceToHost));cudaEventSynchronize(endD);CudaSafeCall(cudaEventElapsedTime(&gpuTime,startD,endD));CudaSafeCall(cudaEventDestroy(startD));CudaSafeCall(cudaEventDestroy(endD));//汇总求和for(int i=0;i<gridSize;i++){resultHost += arrResult[i];resultGpu += resultFromD[i];}printf("数据量 %ld ;串行结算结果为%.3f,耗时 %.3f ms;GPU计算结果为%.3f,耗时 %.3f ms;加速比为%.3f\n",N,resultHost,cpuTime,resultGpu,gpuTime,cpuTime/gpuTime);CudaSafeCall(cudaFree(arrD));CudaSafeCall(cudaFree(resultD));free(arrHost);free(arrResult);free(resultFromD);return 0;
}
4. 测试结果
串行(ms) | gpu(ms) | 加速比 | |
---|---|---|---|
baseline | 95.2 | 5.3 | 17.8 |
优化 | 89.1 | 1.029 | 87.2 |
5. 结果分析
(1)baseline版本相邻求和,时间复杂度从 O(n),降为log(n)
(2)经测试优化中的优化二合并访问内存能加速3倍,还有加载时计算也对性能提升较大。