Lines Matching refs:shared

110   __shared__ MatScalar shared[32][BLOCKY];  in matmult_seqsell_tiled_kernel9()  local
125 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; in matmult_seqsell_tiled_kernel9()
127 if (tidy < sliceheight) t = shared[tidy][tidx]; in matmult_seqsell_tiled_kernel9()
130 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; in matmult_seqsell_tiled_kernel9()
132 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x]; in matmult_seqsell_tiled_kernel9()
139 __shared__ MatScalar shared[32][BLOCKY]; in matmultadd_seqsell_tiled_kernel9() local
154 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; in matmultadd_seqsell_tiled_kernel9()
156 if (tidy < sliceheight) t = shared[tidy][tidx]; in matmultadd_seqsell_tiled_kernel9()
159 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; in matmultadd_seqsell_tiled_kernel9()
161 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][thre… in matmultadd_seqsell_tiled_kernel9()
165 __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScala… in segment_scan() argument
171 shared[threadIdx.x + threadIdx.y * 32] = 0; in segment_scan()
173 shared[threadIdx.x + threadIdx.y * 32] = *val; in segment_scan()
177 if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16]; in segment_scan()
187 __shared__ MatScalar shared[BLOCKY * 32]; in matmult_seqsell_tiled_kernel8() local
211 write = segment_scan<BLOCKY>(flag, shared, &t); in matmult_seqsell_tiled_kernel8()
223 …if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx… in matmult_seqsell_tiled_kernel8()
225 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ in matmult_seqsell_tiled_kernel8()
228 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ in matmult_seqsell_tiled_kernel8()
230 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx… in matmult_seqsell_tiled_kernel8()
242 __shared__ MatScalar shared[BLOCKY * 32]; in matmultadd_seqsell_tiled_kernel8() local
266 write = segment_scan<BLOCKY>(flag, shared, &t); in matmultadd_seqsell_tiled_kernel8()
278 …if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx… in matmultadd_seqsell_tiled_kernel8()
280 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ in matmultadd_seqsell_tiled_kernel8()
283 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ in matmultadd_seqsell_tiled_kernel8()
285 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx… in matmultadd_seqsell_tiled_kernel8()
328 __shared__ MatScalar shared[512]; in matmult_seqsell_tiled_kernel6() local
336 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel6()
337 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel6()
339 …if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) … in matmult_seqsell_tiled_kernel6()
341 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in matmult_seqsell_tiled_kernel6()
343 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmult_seqsell_tiled_kernel6()
345 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmult_seqsell_tiled_kernel6()
348 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel6()
349 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel6()
356 __shared__ MatScalar shared[512]; in matmult_seqsell_tiled_kernel5() local
364 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel5()
365 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel5()
367 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in matmult_seqsell_tiled_kernel5()
369 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmult_seqsell_tiled_kernel5()
371 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmult_seqsell_tiled_kernel5()
374 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel5()
375 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel5()
382 __shared__ MatScalar shared[512]; in matmult_seqsell_tiled_kernel4() local
390 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel4()
391 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel4()
393 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmult_seqsell_tiled_kernel4()
395 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmult_seqsell_tiled_kernel4()
398 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel4()
399 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel4()
406 __shared__ MatScalar shared[512]; in matmult_seqsell_tiled_kernel3() local
414 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel3()
415 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel3()
417 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmult_seqsell_tiled_kernel3()
420 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel3()
421 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel3()
428 __shared__ MatScalar shared[512]; in matmult_seqsell_tiled_kernel2() local
436 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel2()
437 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel2()
440 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel2()
441 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel2()
448 __shared__ MatScalar shared[512]; in matmultadd_seqsell_tiled_kernel6() local
456 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel6()
457 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel6()
459 …if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) … in matmultadd_seqsell_tiled_kernel6()
461 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in matmultadd_seqsell_tiled_kernel6()
463 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmultadd_seqsell_tiled_kernel6()
465 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel6()
468 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel6()
469 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel6()
476 __shared__ MatScalar shared[512]; in matmultadd_seqsell_tiled_kernel5() local
484 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel5()
485 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel5()
487 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in matmultadd_seqsell_tiled_kernel5()
489 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmultadd_seqsell_tiled_kernel5()
491 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel5()
494 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel5()
495 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel5()
502 __shared__ MatScalar shared[512]; in matmultadd_seqsell_tiled_kernel4() local
510 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel4()
511 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel4()
513 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmultadd_seqsell_tiled_kernel4()
515 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel4()
518 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel4()
519 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel4()
526 __shared__ MatScalar shared[512]; in matmultadd_seqsell_tiled_kernel3() local
534 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel3()
535 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel3()
537 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel3()
540 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel3()
541 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel3()
548 __shared__ MatScalar shared[512]; in matmultadd_seqsell_tiled_kernel2() local
556 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel2()
557 …* threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel2()
560 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel2()
561 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel2()