本範例以一個 block 來進行向量元素的總和:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define SIZE 1000

/* Abort with a readable message when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Sum the `len` ints of `a` and store the result in `*sum`.
 *
 * Written for a single-block launch with at most 1000 threads (the capacity
 * of partialSum). Each thread accumulates the elements threadIdx.x,
 * threadIdx.x + blockDim.x, ... into its own shared-memory slot, then
 * thread 0 adds the slots together serially.
 *
 *   a   - device pointer to the input vector
 *   sum - device pointer to a single int that receives the total
 *   len - number of elements in a
 */
__global__ static void vecSum(int *a, int *sum, int len)
{
    __shared__ int partialSum[1000];
    int i;

    partialSum[threadIdx.x] = 0;
    for (i = threadIdx.x; i < len; i += blockDim.x) {
        partialSum[threadIdx.x] += a[i];
    }

    /* All partial sums must be written before thread 0 reads them;
       without this barrier the read below races with the other warps. */
    __syncthreads();

    if (threadIdx.x == 0) {
        /* Accumulate in a register and write global memory once. */
        int total = 0;

        /* Only the first blockDim.x slots were initialised above. */
        for (i = 0; i < (int)blockDim.x; i++) {
            total += partialSum[i];
        }
        *sum = total;
    }
}

int a[SIZE];

/* Fill a[] with random values in [0, 99], sum them on the CPU and on the
   GPU, and print both results for comparison. */
int main(int argc, char **argv)
{
    int *gpu_a, *gpu_s;
    int i, s1 = 0, s2 = 0;

    srand((unsigned)time(NULL));
    for (i = 0; i < SIZE; i++) {
        a[i] = rand() % 100;
        s1 += a[i];
    }

    CUDA_CHECK(cudaMalloc((void**)&gpu_a, sizeof(int) * SIZE));
    CUDA_CHECK(cudaMalloc((void**)&gpu_s, sizeof(int) * 1));
    CUDA_CHECK(cudaMemcpy(gpu_a, a, sizeof(int) * SIZE, cudaMemcpyHostToDevice));

    vecSum<<<1, 1000>>>(gpu_a, gpu_s, SIZE);
    CUDA_CHECK(cudaGetLastError());      /* launch-configuration errors */
    CUDA_CHECK(cudaDeviceSynchronize()); /* errors raised while the kernel ran */

    CUDA_CHECK(cudaMemcpy(&s2, gpu_s, sizeof(int) * 1, cudaMemcpyDeviceToHost));
    printf("CPU: %d, GPU: %d\n", s1, s2);

    CUDA_CHECK(cudaFree(gpu_a));
    CUDA_CHECK(cudaFree(gpu_s));
    return 0;
}
上述範例的 kernel 中,最後做加總只使用到 thread 0,很沒有效率。現在來嘗試做一點修改:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define SIZE 100000

/* Abort with a readable message when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Sum the `len` ints of `a` and store the result in `*sum`.
 *
 * Written for a single-block launch. Preconditions: blockDim.x is a power
 * of two (otherwise the halving loop below skips elements) and is at most
 * 1024 (the capacity of partialSum).
 *
 * Each thread first accumulates the elements threadIdx.x,
 * threadIdx.x + blockDim.x, ... into its own shared-memory slot. The slots
 * are then combined pairwise: on every pass the lower half of the active
 * range adds in the upper half, and the active range is halved, until
 * partialSum[0] holds the total.
 *
 *   a   - device pointer to the input vector
 *   sum - device pointer to a single int that receives the total
 *   len - number of elements in a
 */
__global__ static void vecSum(int *a, int *sum, int len)
{
    int i, offset;
    __shared__ int partialSum[1024];

    partialSum[threadIdx.x] = 0;
    for (i = threadIdx.x; i < len; i += blockDim.x) {
        partialSum[threadIdx.x] += a[i];
    }
    /* Every slot must be final before any thread reads a neighbour's slot. */
    __syncthreads();

    offset = blockDim.x / 2;
    while (offset > 0) {
        if (threadIdx.x < offset) {
            partialSum[threadIdx.x] += partialSum[threadIdx.x + offset];
        }
        offset /= 2;
        /* Outside the `if` so every thread of the block reaches the barrier. */
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        *sum = partialSum[0];
    }
}

int a[SIZE];

/* Fill a[] with random values in [0, 99], sum them on the CPU and on the
   GPU, and print both results for comparison. */
int main(int argc, char **argv)
{
    int *gpu_a, *gpu_s;
    int i, s1 = 0, s2 = 0;

    srand((unsigned)time(NULL));
    for (i = 0; i < SIZE; i++) {
        /* `% 100`, not `* 100`: multiplying rand() by 100 overflows int for
           almost every sample, which is undefined behaviour. With values in
           [0, 99] the total of SIZE elements stays far below INT_MAX. */
        a[i] = rand() % 100;
        s1 += a[i];
    }

    CUDA_CHECK(cudaMalloc((void**)&gpu_a, sizeof(int) * SIZE));
    CUDA_CHECK(cudaMalloc((void**)&gpu_s, sizeof(int) * 1));
    CUDA_CHECK(cudaMemcpy(gpu_a, a, sizeof(int) * SIZE, cudaMemcpyHostToDevice));

    vecSum<<<1, 1024>>>(gpu_a, gpu_s, SIZE);
    CUDA_CHECK(cudaGetLastError());      /* launch-configuration errors */
    CUDA_CHECK(cudaDeviceSynchronize()); /* errors raised while the kernel ran */

    CUDA_CHECK(cudaMemcpy(&s2, gpu_s, sizeof(int) * 1, cudaMemcpyDeviceToHost));
    printf("CPU: %d, GPU: %d\n", s1, s2);

    CUDA_CHECK(cudaFree(gpu_a));
    CUDA_CHECK(cudaFree(gpu_s));
    return 0;
}
這種方法,我們稱為 parallel reduction。關於更詳細的改進過程,可以參考官方資訊。
另外,shared memory 也可以動態的指定大小。只要把:
__shared__ int partialSum[1024];換成:
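extern __shared__ int partialSum[];並且在 kernel 啟動的時候,在 <<<...>>> 中多加第三個參數,以位元組(bytes)為單位指定 shared memory 的大小即可,例如:vecSum<<<1, 1024, 1024 * sizeof(int)>>>(gpu_a, gpu_s, SIZE);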