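Atomic operation 可以保證對某個記憶體位址的「讀取-修改-寫回」是不可分割的:在一個 thread 完成更新之前,其他 thread 對同一位址的 atomic operation 不會插入其中,因此多個 thread 同時更新同一個位址時不會遺失任何一次更新。 CUDA 提供了各種 atomic operation,例如 atomic{Add, Sub, Exch, Min, Max, Inc, Dec} 等等。 以下以計算直方圖(histogram)為例示範 atomicAdd: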
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SIZE 100000             /* number of input samples */
#define RANGE 100               /* number of histogram bins; samples lie in [0, RANGE) */
#define THREADS_PER_BLOCK 1024  /* block size used for the hist kernel */

/*
 * initHist is launched as a single block with one thread per bin, so RANGE
 * must fit in one block (1024 threads on compute capability 2.0 and later).
 */
#if RANGE > 1024
#error "RANGE must not exceed 1024: initHist is launched as one block of RANGE threads"
#endif

/*
 * Abort with a readable message when a CUDA runtime call fails.
 * Without this, a failed allocation/copy/launch would go unnoticed and the
 * program would simply print a wrong histogram.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Zero the histogram on the device.
 *
 * Launch layout: one block; thread t clears h[t]. There is no bounds check,
 * so the caller must launch exactly as many threads as h has elements.
 */
__global__ static void initHist(int *h)
{
    h[threadIdx.x] = 0;
}

/*
 * Build a histogram of a[0..len) into h.
 *
 * Launch layout: 1D grid of 1D blocks with at least len threads in total;
 * surplus threads in the last block are masked off by the idx < len guard.
 *
 * Preconditions: every a[i] must be a valid index into h, and h must have
 * been zeroed beforehand.
 *
 * Many threads may hit the same bin at once, so the increment is done with
 * atomicAdd; a plain h[a[idx]]++ would lose updates.
 */
__global__ static void hist(const int *a, int *h, int len)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        atomicAdd(&h[a[idx]], 1);
    }
}

/* Host-side input data, CPU reference histogram, and GPU result. */
int a[SIZE], hist1[RANGE], hist2[RANGE];

/*
 * Generate SIZE random samples in [0, RANGE), histogram them on the CPU
 * (hist1) and on the GPU (hist2), and print both side by side so they can be
 * compared. Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
 */
int main(int argc, char **argv)
{
    int *gpu_a = NULL, *gpu_hist = NULL;
    int i;
    /* Ceil-division: enough blocks to cover SIZE without a spare block. */
    const int blocks = (SIZE + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    (void)argc;
    (void)argv;

    srand((unsigned int)time(NULL));

    for (i = 0; i < RANGE; i++) {
        hist1[i] = 0;
    }
    for (i = 0; i < SIZE; i++) {
        /* Use RANGE (not a literal) so samples always index a valid bin. */
        a[i] = rand() % RANGE;
        hist1[a[i]]++;
    }

    CUDA_CHECK(cudaMalloc((void **)&gpu_a, sizeof(int) * SIZE));
    CUDA_CHECK(cudaMalloc((void **)&gpu_hist, sizeof(int) * RANGE));
    CUDA_CHECK(cudaMemcpy(gpu_a, a, sizeof(int) * SIZE, cudaMemcpyHostToDevice));

    initHist<<<1, RANGE>>>(gpu_hist);
    CUDA_CHECK(cudaGetLastError());        /* launch-configuration errors */
    CUDA_CHECK(cudaDeviceSynchronize());   /* errors raised while the kernel ran */

    hist<<<blocks, THREADS_PER_BLOCK>>>(gpu_a, gpu_hist, SIZE);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(hist2, gpu_hist, sizeof(int) * RANGE, cudaMemcpyDeviceToHost));

    for (i = 0; i < RANGE; i++) {
        printf("%3d %5d %5d\n", i, hist1[i], hist2[i]);
    }

    CUDA_CHECK(cudaFree(gpu_a));
    CUDA_CHECK(cudaFree(gpu_hist));

    return 0;
}