Lines Matching refs:sliceheight
78 static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const Pet… in matmult_seqsell_basic_kernel() argument
85 slice_id = row / sliceheight; in matmult_seqsell_basic_kernel()
86 row_in_slice = row % sliceheight; in matmult_seqsell_basic_kernel()
88 …for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[… in matmult_seqsell_basic_kernel()
93 static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const … in matmultadd_seqsell_basic_kernel() argument
100 slice_id = row / sliceheight; in matmultadd_seqsell_basic_kernel()
101 row_in_slice = row % sliceheight; in matmultadd_seqsell_basic_kernel()
103 …for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[… in matmultadd_seqsell_basic_kernel()
112 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt … in matmult_seqsell_tiled_kernel9() argument
122 row = slice_id * sliceheight + threadIdx.x % sliceheight; in matmult_seqsell_tiled_kernel9()
127 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); in matmult_seqsell_tiled_kernel9()
129 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; in matmult_seqsell_tiled_kernel9()
131 if (tidy < sliceheight) t = shared[tidy][tidx]; in matmult_seqsell_tiled_kernel9()
134 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; in matmult_seqsell_tiled_kernel9()
136 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x]; in matmult_seqsell_tiled_kernel9()
141 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscI… in matmultadd_seqsell_tiled_kernel9() argument
151 row = slice_id * sliceheight + threadIdx.x % sliceheight; in matmultadd_seqsell_tiled_kernel9()
156 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); in matmultadd_seqsell_tiled_kernel9()
158 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; in matmultadd_seqsell_tiled_kernel9()
160 if (tidy < sliceheight) t = shared[tidy][tidx]; in matmultadd_seqsell_tiled_kernel9()
163 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; in matmultadd_seqsell_tiled_kernel9()
165 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][thre… in matmultadd_seqsell_tiled_kernel9()
189 __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunks… in matmult_seqsell_tiled_kernel8() argument
208 … slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries … in matmult_seqsell_tiled_kernel8()
212 row = slice_id * sliceheight + threadIdx.x % sliceheight; in matmult_seqsell_tiled_kernel8()
219 row = start_slice * sliceheight + threadIdx.x % sliceheight; in matmult_seqsell_tiled_kernel8()
225 … for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); in matmult_seqsell_tiled_kernel8()
227 …if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx… in matmult_seqsell_tiled_kernel8()
229 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ in matmult_seqsell_tiled_kernel8()
232 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ in matmult_seqsell_tiled_kernel8()
234 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx… in matmult_seqsell_tiled_kernel8()
244 __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chu… in matmultadd_seqsell_tiled_kernel8() argument
263 … slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries … in matmultadd_seqsell_tiled_kernel8()
267 row = slice_id * sliceheight + threadIdx.x % sliceheight; in matmultadd_seqsell_tiled_kernel8()
274 row = start_slice * sliceheight + threadIdx.x % sliceheight; in matmultadd_seqsell_tiled_kernel8()
280 … for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); in matmultadd_seqsell_tiled_kernel8()
282 …if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx… in matmultadd_seqsell_tiled_kernel8()
284 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ in matmultadd_seqsell_tiled_kernel8()
287 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ in matmultadd_seqsell_tiled_kernel8()
289 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx… in matmultadd_seqsell_tiled_kernel8()
298 static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const Pe… in matmult_seqsell_tiled_kernel7() argument
302 row = slice_id * sliceheight + threadIdx.x % sliceheight; in matmult_seqsell_tiled_kernel7()
308 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); in matmult_seqsell_tiled_kernel7()
309 if (row < nrows && threadIdx.x < sliceheight) y[row] = t; in matmult_seqsell_tiled_kernel7()
313 static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const… in matmultadd_seqsell_tiled_kernel7() argument
317 row = slice_id * sliceheight + threadIdx.x % sliceheight; in matmultadd_seqsell_tiled_kernel7()
323 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); in matmultadd_seqsell_tiled_kernel7()
324 if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t; in matmultadd_seqsell_tiled_kernel7()
331 static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const Pe… in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
338 slice_id = row / sliceheight; in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
339 row_in_slice = row % sliceheight; in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
342 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
359 static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const Pe… in matmult_seqsell_tiled_kernel5() argument
366 slice_id = row / sliceheight; in matmult_seqsell_tiled_kernel5()
367 row_in_slice = row % sliceheight; in matmult_seqsell_tiled_kernel5()
370 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmult_seqsell_tiled_kernel5()
385 static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const Pe… in matmult_seqsell_tiled_kernel4() argument
392 slice_id = row / sliceheight; in matmult_seqsell_tiled_kernel4()
393 row_in_slice = row % sliceheight; in matmult_seqsell_tiled_kernel4()
396 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmult_seqsell_tiled_kernel4()
409 static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const Pe… in matmult_seqsell_tiled_kernel3() argument
416 slice_id = row / sliceheight; in matmult_seqsell_tiled_kernel3()
417 row_in_slice = row % sliceheight; in matmult_seqsell_tiled_kernel3()
420 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmult_seqsell_tiled_kernel3()
431 static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const Pe… in matmult_seqsell_tiled_kernel2() argument
438 slice_id = row / sliceheight; in matmult_seqsell_tiled_kernel2()
439 row_in_slice = row % sliceheight; in matmult_seqsell_tiled_kernel2()
442 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmult_seqsell_tiled_kernel2()
451 static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const… in matmultadd_seqsell_tiled_kernel6() argument
458 slice_id = row / sliceheight; in matmultadd_seqsell_tiled_kernel6()
459 row_in_slice = row % sliceheight; in matmultadd_seqsell_tiled_kernel6()
462 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmultadd_seqsell_tiled_kernel6()
479 static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const… in matmultadd_seqsell_tiled_kernel5() argument
486 slice_id = row / sliceheight; in matmultadd_seqsell_tiled_kernel5()
487 row_in_slice = row % sliceheight; in matmultadd_seqsell_tiled_kernel5()
490 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmultadd_seqsell_tiled_kernel5()
505 static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const… in matmultadd_seqsell_tiled_kernel4() argument
512 slice_id = row / sliceheight; in matmultadd_seqsell_tiled_kernel4()
513 row_in_slice = row % sliceheight; in matmultadd_seqsell_tiled_kernel4()
516 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmultadd_seqsell_tiled_kernel4()
529 static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const… in matmultadd_seqsell_tiled_kernel3() argument
536 slice_id = row / sliceheight; in matmultadd_seqsell_tiled_kernel3()
537 row_in_slice = row % sliceheight; in matmultadd_seqsell_tiled_kernel3()
540 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmultadd_seqsell_tiled_kernel3()
551 static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const… in matmultadd_seqsell_tiled_kernel2() argument
558 slice_id = row / sliceheight; in matmultadd_seqsell_tiled_kernel2()
559 row_in_slice = row % sliceheight; in matmultadd_seqsell_tiled_kernel2()
562 …or (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i +… in matmultadd_seqsell_tiled_kernel2()
577 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; in MatMult_SeqSELLHIP() local
589 …sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor… in MatMult_SeqSELLHIP()
590 …sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the s… in MatMult_SeqSELLHIP()
604 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLHIP()
606 …matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, ava… in MatMult_SeqSELLHIP()
608 …matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, ava… in MatMult_SeqSELLHIP()
610 …matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, ava… in MatMult_SeqSELLHIP()
612 …matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, a… in MatMult_SeqSELLHIP()
614 …matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, ava… in MatMult_SeqSELLHIP()
618 nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight); in MatMult_SeqSELLHIP()
620 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, … in MatMult_SeqSELLHIP()
622 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, … in MatMult_SeqSELLHIP()
624 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, … in MatMult_SeqSELLHIP()
626 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval,… in MatMult_SeqSELLHIP()
628 nblocks = 1 + (nrows - 1) / (2 * sliceheight); in MatMult_SeqSELLHIP()
629 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, … in MatMult_SeqSELLHIP()
635 …matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, … in MatMult_SeqSELLHIP()
639 …matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, … in MatMult_SeqSELLHIP()
643 …matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y… in MatMult_SeqSELLHIP()
647 …matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y… in MatMult_SeqSELLHIP()
651 …matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y… in MatMult_SeqSELLHIP()
655 …matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x,… in MatMult_SeqSELLHIP()
668 …matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblo… in MatMult_SeqSELLHIP()
670 …matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblo… in MatMult_SeqSELLHIP()
672 …matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblo… in MatMult_SeqSELLHIP()
674 …matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperb… in MatMult_SeqSELLHIP()
676 …matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblo… in MatMult_SeqSELLHIP()
679 PetscInt avgslicesize = sliceheight * a->avgslicewidth; in MatMult_SeqSELLHIP()
681 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { in MatMult_SeqSELLHIP()
682 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ in MatMult_SeqSELLHIP()
683 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, … in MatMult_SeqSELLHIP()
685 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLHIP()
686 …matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, ava… in MatMult_SeqSELLHIP()
689 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLHIP()
690 …matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, ava… in MatMult_SeqSELLHIP()
692 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLHIP()
693 …matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, a… in MatMult_SeqSELLHIP()
714 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; in MatMultAdd_SeqSELLHIP() local
725 …sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor… in MatMultAdd_SeqSELLHIP()
726 …sliceheight != sliceheight), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requ… in MatMultAdd_SeqSELLHIP()
739 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLHIP()
741 …matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, … in MatMultAdd_SeqSELLHIP()
743 …matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, … in MatMultAdd_SeqSELLHIP()
745 …matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, … in MatMultAdd_SeqSELLHIP()
747 …matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx… in MatMultAdd_SeqSELLHIP()
749 …matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, … in MatMultAdd_SeqSELLHIP()
760 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
762 …matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
764 …matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
766 …matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksp… in MatMultAdd_SeqSELLHIP()
768 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
772 nblocks = 1 + (nrows - 1) / (blocky * sliceheight); in MatMultAdd_SeqSELLHIP()
774 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, ava… in MatMultAdd_SeqSELLHIP()
776 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, ava… in MatMultAdd_SeqSELLHIP()
778 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, ava… in MatMultAdd_SeqSELLHIP()
780 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, av… in MatMultAdd_SeqSELLHIP()
782 nblocks = 1 + (nrows - 1) / (2 * sliceheight); in MatMultAdd_SeqSELLHIP()
783 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, ava… in MatMultAdd_SeqSELLHIP()
789 …matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, … in MatMultAdd_SeqSELLHIP()
793 …matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, … in MatMultAdd_SeqSELLHIP()
797 …matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x… in MatMultAdd_SeqSELLHIP()
801 …matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x… in MatMultAdd_SeqSELLHIP()
805 …matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x… in MatMultAdd_SeqSELLHIP()
809 …matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx,… in MatMultAdd_SeqSELLHIP()
822 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
824 …matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
826 …matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
828 …matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksp… in MatMultAdd_SeqSELLHIP()
830 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksper… in MatMultAdd_SeqSELLHIP()
833 PetscInt avgslicesize = sliceheight * a->avgslicewidth; in MatMultAdd_SeqSELLHIP()
835 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { in MatMultAdd_SeqSELLHIP()
836 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ in MatMultAdd_SeqSELLHIP()
837 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, ava… in MatMultAdd_SeqSELLHIP()
839 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLHIP()
840 …matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, … in MatMultAdd_SeqSELLHIP()
843 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLHIP()
844 …matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, … in MatMultAdd_SeqSELLHIP()
846 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLHIP()
847 …matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx… in MatMultAdd_SeqSELLHIP()