// ArrayMul: per-block dot-product partial sums.
//
// Each thread multiplies one pair of elements, A[gid] * B[gid], and the
// threads of a block then add those products together in shared memory.
// Thread 0 of every block writes the block's total to C[blockIdx.x], so C
// ends up holding one partial sum per block.
//
// Launch requirements (not checked inside the kernel):
//   - Only the .x components of the block/grid indices are used, so a 1D
//     launch is assumed.
//   - blockDim.x must not exceed BLOCKSIZE (the static size of prods[]).
//   - A and B must hold at least gridDim.x * blockDim.x elements, because
//     there is no bounds check on the global index.
//   - C must hold at least gridDim.x elements.
//
// blockDim.x does not have to be a power of two: the reduction skips any
// partner slot that lies at or beyond blockDim.x.
__global__ void ArrayMul( float *A, float *B, float *C )
{
    __shared__ float prods[BLOCKSIZE];

    const unsigned int numItems = blockDim.x;   // threads (and products) in this block
    const unsigned int tnum     = threadIdx.x;  // thread index within the block
    const unsigned int wgNum    = blockIdx.x;   // block index, also the output slot in C

    // Widen before multiplying so the global index cannot wrap at 32 bits.
    const unsigned long long gid =
        (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;

    prods[tnum] = A[gid] * B[gid];

    // Pairwise tree reduction: at each step the threads whose index is a
    // multiple of 2*offset add in the value that sits `offset` slots above.
    for (unsigned int offset = 1; offset < numItems; offset *= 2)
    {
        const unsigned int mask = 2 * offset - 1;

        // Every thread reaches this barrier (the loop bound is uniform across
        // the block); it makes the previous step's writes visible.
        __syncthreads();

        // The second test keeps the read inside the initialized part of
        // prods[] when blockDim.x is not a power of two.
        if ((tnum & mask) == 0 && tnum + offset < numItems)
        {
            prods[tnum] += prods[tnum + offset];
        }
    }

    __syncthreads();
    if (tnum == 0)
        C[wgNum] = prods[0];
}