Lines Matching refs:nrows
75 static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const Pet… in matmult_seqsell_basic_kernel() argument
81 if (row < nrows) { in matmult_seqsell_basic_kernel()
90 static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const … in matmultadd_seqsell_basic_kernel() argument
96 if (row < nrows) { in matmultadd_seqsell_basic_kernel()
108 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt … in matmult_seqsell_tiled_kernel9() argument
119 if (row < nrows) { in matmult_seqsell_tiled_kernel9()
132 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x]; in matmult_seqsell_tiled_kernel9()
137 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscI… in matmultadd_seqsell_tiled_kernel9() argument
148 if (row < nrows) { in matmultadd_seqsell_tiled_kernel9()
161 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][thre… in matmultadd_seqsell_tiled_kernel9()
185 __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunks… in matmult_seqsell_tiled_kernel8() argument
192 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { in matmult_seqsell_tiled_kernel8()
194 if (gid < nrows) y[gid] = 0.0; in matmult_seqsell_tiled_kernel8()
204 …PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), t… in matmult_seqsell_tiled_kernel8()
209 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; in matmult_seqsell_tiled_kernel8()
212 if (row < nrows && gid < totalentries && write) atomAdd(y[row], t); in matmult_seqsell_tiled_kernel8()
216 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; in matmult_seqsell_tiled_kernel8()
230 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx… in matmult_seqsell_tiled_kernel8()
240 __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chu… in matmultadd_seqsell_tiled_kernel8() argument
247 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { in matmultadd_seqsell_tiled_kernel8()
249 if (gid < nrows) z[gid] = y[gid]; in matmultadd_seqsell_tiled_kernel8()
259 …PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), t… in matmultadd_seqsell_tiled_kernel8()
264 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; in matmultadd_seqsell_tiled_kernel8()
267 if (row < nrows && gid < totalentries && write) atomAdd(z[row], t); in matmultadd_seqsell_tiled_kernel8()
271 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; in matmultadd_seqsell_tiled_kernel8()
285 …if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx… in matmultadd_seqsell_tiled_kernel8()
294 static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const Pe… in matmult_seqsell_tiled_kernel7() argument
300 if (row < nrows) { in matmult_seqsell_tiled_kernel7()
305 if (row < nrows && threadIdx.x < sliceheight) y[row] = t; in matmult_seqsell_tiled_kernel7()
309 static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const… in matmultadd_seqsell_tiled_kernel7() argument
315 if (row < nrows) { in matmultadd_seqsell_tiled_kernel7()
320 if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t; in matmultadd_seqsell_tiled_kernel7()
326 static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const… in matmult_seqsell_tiled_kernel6() argument
332 if (row < nrows) { in matmult_seqsell_tiled_kernel6()
354 static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const… in matmult_seqsell_tiled_kernel5() argument
360 if (row < nrows) { in matmult_seqsell_tiled_kernel5()
380 static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const… in matmult_seqsell_tiled_kernel4() argument
386 if (row < nrows) { in matmult_seqsell_tiled_kernel4()
404 static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const… in matmult_seqsell_tiled_kernel3() argument
410 if (row < nrows) { in matmult_seqsell_tiled_kernel3()
426 static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const… in matmult_seqsell_tiled_kernel2() argument
432 if (row < nrows) { in matmult_seqsell_tiled_kernel2()
446 static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, co… in matmultadd_seqsell_tiled_kernel6() argument
452 if (row < nrows) { in matmultadd_seqsell_tiled_kernel6()
474 static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, co… in matmultadd_seqsell_tiled_kernel5() argument
480 if (row < nrows) { in matmultadd_seqsell_tiled_kernel5()
500 static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, co… in matmultadd_seqsell_tiled_kernel4() argument
506 if (row < nrows) { in matmultadd_seqsell_tiled_kernel4()
524 static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, co… in matmultadd_seqsell_tiled_kernel3() argument
530 if (row < nrows) { in matmultadd_seqsell_tiled_kernel3()
546 static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, co… in matmultadd_seqsell_tiled_kernel2() argument
552 if (row < nrows) { in matmultadd_seqsell_tiled_kernel2()
572 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; in MatMult_SeqSELLCUDA() local
599 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLCUDA()
601 …matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMult_SeqSELLCUDA()
603 …matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMult_SeqSELLCUDA()
605 …matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMult_SeqSELLCUDA()
607 …matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sl… in MatMult_SeqSELLCUDA()
609 …matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sl… in MatMult_SeqSELLCUDA()
611 …matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMult_SeqSELLCUDA()
615 nblocks = 1 + (nrows - 1) / (2 * sliceheight); in MatMult_SeqSELLCUDA()
617 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx,… in MatMult_SeqSELLCUDA()
619 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx,… in MatMult_SeqSELLCUDA()
621 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx,… in MatMult_SeqSELLCUDA()
623 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx… in MatMult_SeqSELLCUDA()
625 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx… in MatMult_SeqSELLCUDA()
627 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx,… in MatMult_SeqSELLCUDA()
632 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ in MatMult_SeqSELLCUDA()
633 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); in MatMult_SeqSELLCUDA()
636 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ in MatMult_SeqSELLCUDA()
637 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); in MatMult_SeqSELLCUDA()
640 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ in MatMult_SeqSELLCUDA()
641 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); in MatMult_SeqSELLCUDA()
644 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ in MatMult_SeqSELLCUDA()
645 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); in MatMult_SeqSELLCUDA()
648 nblocks = 1 + (nrows - 1) / (blocksize / 2); in MatMult_SeqSELLCUDA()
649 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); in MatMult_SeqSELLCUDA()
652 nblocks = 1 + (nrows - 1) / blocksize; in MatMult_SeqSELLCUDA()
653 …matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x,… in MatMult_SeqSELLCUDA()
658 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ in MatMult_SeqSELLCUDA()
666 …matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nch… in MatMult_SeqSELLCUDA()
668 …matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nch… in MatMult_SeqSELLCUDA()
670 …matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nch… in MatMult_SeqSELLCUDA()
672 …matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, n… in MatMult_SeqSELLCUDA()
674 …matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, n… in MatMult_SeqSELLCUDA()
676 …matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nch… in MatMult_SeqSELLCUDA()
681 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { in MatMult_SeqSELLCUDA()
682 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ in MatMult_SeqSELLCUDA()
683 …matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx,… in MatMult_SeqSELLCUDA()
685 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLCUDA()
686 …matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMult_SeqSELLCUDA()
689 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLCUDA()
690 …matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMult_SeqSELLCUDA()
692 nblocks = 1 + (nrows - 1) / sliceheight; in MatMult_SeqSELLCUDA()
693 …matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sl… in MatMult_SeqSELLCUDA()
714 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; in MatMultAdd_SeqSELLCUDA() local
739 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLCUDA()
741 …matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, s… in MatMultAdd_SeqSELLCUDA()
743 …matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, s… in MatMultAdd_SeqSELLCUDA()
745 …matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, s… in MatMultAdd_SeqSELLCUDA()
747 …matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval,… in MatMultAdd_SeqSELLCUDA()
749 …matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval,… in MatMultAdd_SeqSELLCUDA()
751 …matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, s… in MatMultAdd_SeqSELLCUDA()
762 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
764 …matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
766 …matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
768 …matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock… in MatMultAdd_SeqSELLCUDA()
770 …matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock… in MatMultAdd_SeqSELLCUDA()
772 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
776 nblocks = 1 + (nrows - 1) / (2 * sliceheight); in MatMultAdd_SeqSELLCUDA()
778 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMultAdd_SeqSELLCUDA()
780 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMultAdd_SeqSELLCUDA()
782 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMultAdd_SeqSELLCUDA()
784 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sli… in MatMultAdd_SeqSELLCUDA()
786 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sli… in MatMultAdd_SeqSELLCUDA()
788 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMultAdd_SeqSELLCUDA()
793 nblocks = 1 + (nrows - 1) / (blocksize / 32); in MatMultAdd_SeqSELLCUDA()
794 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); in MatMultAdd_SeqSELLCUDA()
797 nblocks = 1 + (nrows - 1) / (blocksize / 16); in MatMultAdd_SeqSELLCUDA()
798 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); in MatMultAdd_SeqSELLCUDA()
801 nblocks = 1 + (nrows - 1) / (blocksize / 8); in MatMultAdd_SeqSELLCUDA()
802 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); in MatMultAdd_SeqSELLCUDA()
805 nblocks = 1 + (nrows - 1) / (blocksize / 4); in MatMultAdd_SeqSELLCUDA()
806 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); in MatMultAdd_SeqSELLCUDA()
809 nblocks = 1 + (nrows - 1) / (blocksize / 2); in MatMultAdd_SeqSELLCUDA()
810 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); in MatMultAdd_SeqSELLCUDA()
813 nblocks = 1 + (nrows - 1) / blocksize; in MatMultAdd_SeqSELLCUDA()
814 …matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx,… in MatMultAdd_SeqSELLCUDA()
819 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ in MatMultAdd_SeqSELLCUDA()
827 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
829 …matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
831 …matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
833 …matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock… in MatMultAdd_SeqSELLCUDA()
835 …matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock… in MatMultAdd_SeqSELLCUDA()
837 …matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, … in MatMultAdd_SeqSELLCUDA()
842 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { in MatMultAdd_SeqSELLCUDA()
843 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ in MatMultAdd_SeqSELLCUDA()
844 …matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, slii… in MatMultAdd_SeqSELLCUDA()
846 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLCUDA()
847 …matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, s… in MatMultAdd_SeqSELLCUDA()
850 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLCUDA()
851 …matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, s… in MatMultAdd_SeqSELLCUDA()
853 nblocks = 1 + (nrows - 1) / sliceheight; in MatMultAdd_SeqSELLCUDA()
854 …matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval,… in MatMultAdd_SeqSELLCUDA()