Lines Matching refs:shared
114 __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; in matmult_seqsell_tiled_kernel9() local
129 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; in matmult_seqsell_tiled_kernel9()
131 if (tidy < sliceheight) t = shared[tidy][tidx]; in matmult_seqsell_tiled_kernel9()
134 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; in matmult_seqsell_tiled_kernel9()
136 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x]; in matmult_seqsell_tiled_kernel9()
143 __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; in matmultadd_seqsell_tiled_kernel9() local
158 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; in matmultadd_seqsell_tiled_kernel9()
160 if (tidy < sliceheight) t = shared[tidy][tidx]; in matmultadd_seqsell_tiled_kernel9()
163 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; in matmultadd_seqsell_tiled_kernel9()
165 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][thre… in matmultadd_seqsell_tiled_kernel9()
169 __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScala… in segment_scan() argument
175 shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0; in segment_scan()
177 shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val; in segment_scan()
181 …if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_S… in segment_scan()
191 __shared__ MatScalar shared[BLOCKY * WARP_SIZE]; in matmult_seqsell_tiled_kernel8() local
215 write = segment_scan<BLOCKY>(flag, shared, &t); in matmult_seqsell_tiled_kernel8()
227 …if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx… in matmult_seqsell_tiled_kernel8()
229 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ in matmult_seqsell_tiled_kernel8()
232 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ in matmult_seqsell_tiled_kernel8()
234 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx… in matmult_seqsell_tiled_kernel8()
246 __shared__ MatScalar shared[BLOCKY * WARP_SIZE]; in matmultadd_seqsell_tiled_kernel8() local
270 write = segment_scan<BLOCKY>(flag, shared, &t); in matmultadd_seqsell_tiled_kernel8()
282 …if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx… in matmultadd_seqsell_tiled_kernel8()
284 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ in matmultadd_seqsell_tiled_kernel8()
287 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ in matmultadd_seqsell_tiled_kernel8()
289 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx… in matmultadd_seqsell_tiled_kernel8()
333 __shared__ MatScalar shared[32 * 16]; in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() local
341 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
342 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
344 …if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) … in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
346 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
348 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
350 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
353 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
354 y[row] = shared[threadIdx.x]; in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
361 __shared__ MatScalar shared[32 * 16]; in matmult_seqsell_tiled_kernel5() local
369 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel5()
370 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel5()
372 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in matmult_seqsell_tiled_kernel5()
374 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmult_seqsell_tiled_kernel5()
376 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmult_seqsell_tiled_kernel5()
379 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel5()
380 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel5()
387 __shared__ MatScalar shared[32 * 16]; in matmult_seqsell_tiled_kernel4() local
395 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel4()
396 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel4()
398 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmult_seqsell_tiled_kernel4()
400 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmult_seqsell_tiled_kernel4()
403 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel4()
404 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel4()
411 __shared__ MatScalar shared[32 * 16]; in matmult_seqsell_tiled_kernel3() local
419 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel3()
420 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel3()
422 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmult_seqsell_tiled_kernel3()
425 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel3()
426 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel3()
433 __shared__ MatScalar shared[32 * 16]; in matmult_seqsell_tiled_kernel2() local
441 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmult_seqsell_tiled_kernel2()
442 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmult_seqsell_tiled_kernel2()
445 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmult_seqsell_tiled_kernel2()
446 y[row] = shared[threadIdx.x]; in matmult_seqsell_tiled_kernel2()
453 __shared__ MatScalar shared[32 * 16]; in matmultadd_seqsell_tiled_kernel6() local
461 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel6()
462 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel6()
464 …if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) … in matmultadd_seqsell_tiled_kernel6()
466 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in matmultadd_seqsell_tiled_kernel6()
468 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmultadd_seqsell_tiled_kernel6()
470 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel6()
473 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel6()
474 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel6()
481 __shared__ MatScalar shared[32 * 16]; in matmultadd_seqsell_tiled_kernel5() local
489 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel5()
490 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel5()
492 …if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * … in matmultadd_seqsell_tiled_kernel5()
494 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmultadd_seqsell_tiled_kernel5()
496 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel5()
499 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel5()
500 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel5()
507 __shared__ MatScalar shared[32 * 16]; in matmultadd_seqsell_tiled_kernel4() local
515 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel4()
516 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel4()
518 …if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * … in matmultadd_seqsell_tiled_kernel4()
520 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel4()
523 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel4()
524 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel4()
531 __shared__ MatScalar shared[32 * 16]; in matmultadd_seqsell_tiled_kernel3() local
539 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel3()
540 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel3()
542 …if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * … in matmultadd_seqsell_tiled_kernel3()
545 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel3()
546 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel3()
553 __shared__ MatScalar shared[32 * 16]; in matmultadd_seqsell_tiled_kernel2() local
561 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; in matmultadd_seqsell_tiled_kernel2()
562 … * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * bloc… in matmultadd_seqsell_tiled_kernel2()
565 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; in matmultadd_seqsell_tiled_kernel2()
566 z[row] = y[row] + shared[threadIdx.x]; in matmultadd_seqsell_tiled_kernel2()