Atomic operation 可以保證對同一個記憶體位址的「讀取、修改、寫回」是一個不可分割的動作:同一時間只有一個 thread 能完成這個動作,不會被其他 thread 的存取打斷。 CUDA 提供了各種 atomic operation,例如 atomic{Add, Sub, Exch, Min, Max, Inc, Dec} 等等。 以下以 atomicAdd 示範如何用多個 thread 同時累加同一個直方圖 (histogram):

#include<stdio.h>
#include<time.h>
#include<stdlib.h>

/* Number of input elements in the host array `a` that are histogrammed. */
#define SIZE 100000
/* Number of histogram bins; hist1/hist2 are indexed directly by the input
 * values, so every value must lie in [0, RANGE). */
#define RANGE 100

/*
 * Clears histogram bins on the device, one bin per thread.
 *
 * Each thread writes 0 to h[threadIdx.x]; the block index is not used, so
 * only elements 0 .. blockDim.x-1 are touched no matter how many blocks are
 * launched. The caller must make sure `h` holds at least blockDim.x ints.
 */
__global__ static void initHist(int *h){
	const unsigned int bin = threadIdx.x;
	int *slot = h + bin;
	*slot = 0;
}

/*
 * Accumulates a histogram of a[0 .. len-1] into h using atomicAdd.
 *
 * a   : input values; each a[i] is used directly as an index into h and is
 *       NOT range-checked, so the caller must guarantee 0 <= a[i] < number
 *       of bins in h.
 * h   : histogram bins; must already be zeroed (or hold the counts to add to).
 * len : number of elements in a; nothing is done when len <= 0.
 *
 * The kernel walks the input with a grid-stride loop, so it gives the same
 * result for any 1D launch configuration (including grids with fewer than
 * `len` threads). Indices are computed in 64-bit arithmetic so that
 * blockIdx.x * blockDim.x cannot wrap for very large grids.
 */
__global__ static void hist(int *a, int *h, int len){
	const long long stride = (long long)gridDim.x * blockDim.x;
	long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
	for(; idx < len; idx += stride){
		/* Many threads may hit the same bin; atomicAdd makes the
		 * read-modify-write of that bin indivisible. */
		atomicAdd(&h[a[idx]], 1);
	}
}

/* Host-side buffers used by main():
 *   a     - the input values to histogram,
 *   hist1 - histogram computed on the CPU while the input is generated,
 *   hist2 - histogram copied back from the GPU, printed next to hist1. */
int a[SIZE], hist1[RANGE], hist2[RANGE];

/* Reports a failed CUDA runtime call on stderr and terminates the program. */
static void checkCuda(cudaError_t status, const char *what){
	if(status != cudaSuccess){
		fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Fills `a` with SIZE random values in [0, RANGE), builds their histogram on
 * the CPU (hist1) and on the GPU with atomicAdd (hist2), then prints one line
 * per bin: bin index, CPU count, GPU count.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call or kernel
 * launch fails.
 */
int main(int argc,char **argv)
{
	const int threadsPerBlock = 1024;
	/* Ceil-division: just enough blocks to give every element a thread. */
	const int blocks = (SIZE + threadsPerBlock - 1) / threadsPerBlock;
	int *gpu_a = NULL, *gpu_hist = NULL;
	int i;

	(void)argc;
	(void)argv;

	srand((unsigned int)time(NULL));
	for(i=0;i<RANGE;i++){
		hist1[i] = 0;
	}
	for(i=0;i<SIZE;i++){
		/* Use RANGE (not a literal) so generated values always stay
		 * inside the bounds of hist1/hist2 and the device histogram. */
		a[i] = rand() % RANGE;
		hist1[a[i]]++;
	}

	checkCuda(cudaMalloc((void**)&gpu_a, sizeof(int) * SIZE), "cudaMalloc gpu_a");
	checkCuda(cudaMalloc((void**)&gpu_hist, sizeof(int) * RANGE), "cudaMalloc gpu_hist");
	checkCuda(cudaMemcpy(gpu_a, a, sizeof(int) * SIZE, cudaMemcpyHostToDevice), "copy input to device");

	/* One thread per bin in a single block; this requires RANGE to be no
	 * larger than the device's maximum threads per block. */
	initHist<<<1, RANGE>>>(gpu_hist);
	checkCuda(cudaGetLastError(), "initHist launch");
	checkCuda(cudaDeviceSynchronize(), "initHist execution");

	hist<<<blocks, threadsPerBlock>>>(gpu_a, gpu_hist, SIZE);
	checkCuda(cudaGetLastError(), "hist launch");
	checkCuda(cudaDeviceSynchronize(), "hist execution");

	checkCuda(cudaMemcpy(hist2, gpu_hist, sizeof(int) * RANGE, cudaMemcpyDeviceToHost), "copy histogram to host");

	for(i=0;i<RANGE;i++){
		printf("%3d %5d %5d\n", i, hist1[i], hist2[i]);
	}

	checkCuda(cudaFree(gpu_a), "cudaFree gpu_a");
	checkCuda(cudaFree(gpu_hist), "cudaFree gpu_hist");
	return 0;
}