#include <petscdevice_cuda.h>
#include <petsc/private/cupmatomics.hpp>
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/

#define SLICE_HEIGHT 16

typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;
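/*
  Layout illustration (a small sketch, not data stored by the library): with sliceheight = 2,
  the 4 x 4 matrix

    row 0: [a00 a01  .   . ]
    row 1: [ .  a11  .   . ]
    row 2: [ .   .  a22 a23]
    row 3: [ .   .   .  a33]

  is stored slice by slice, column-major within each slice, padded to the widest row of the slice:

    val    = [a00 a11 a01 pad | a22 a33 a23 pad]
    colidx = [ 0   1   1   *  |  2   3   3   * ]
    sliidx = [0, 4, 8]

  Padding entries hold a zero value and an in-range column index, so aval[i] * x[acolidx[i]]
  contributes nothing to a row sum in the kernels below.
*/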
static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) PetscCallCUDA(cudaFree((*cudastruct)->colidx));
    if ((*cudastruct)->val) PetscCallCUDA(cudaFree((*cudastruct)->val));
    if ((*cudastruct)->sliidx) PetscCallCUDA(cudaFree((*cudastruct)->sliidx));
    if ((*cudastruct)->chunk_slice_map) PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map));
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * sizeof(MatScalar)));
    } else {
      if (cudastruct->colidx) PetscCallCUDA(cudaFree(cudastruct->colidx));
      if (cudastruct->val) PetscCallCUDA(cudaFree(cudastruct->val));
      if (cudastruct->sliidx) PetscCallCUDA(cudaFree(cudastruct->sliidx));
      if (cudastruct->chunk_slice_map) PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map));
      cudastruct->maxallocmat  = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices  = a->totalslices;
      cudastruct->totalchunks  = a->totalchunks;
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
      /* copy values: nz or maxallocmat? only the first sliidx[totalslices] entries are in use */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
      cudastruct->nonzerostate = A->nonzerostate; /* record the nonzero state so the values-only fast path above can trigger on later copies */
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
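/*
  Offload protocol (summary of the logic above): A->offloadmask records where the up-to-date
  data lives. The copy runs only when the host copy is ahead (PETSC_OFFLOAD_CPU) or nothing
  is on the device yet (PETSC_OFFLOAD_UNALLOCATED). If the nonzero pattern is unchanged, only
  the value array is transferred; otherwise the device arrays are rebuilt from scratch. On
  completion both copies agree (PETSC_OFFLOAD_BOTH).
*/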
static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}
#if !defined(PETSC_USE_COMPLEX)
/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
}
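/*
  Reduction scheme of the two kernels above, as an illustrative walkthrough (the numbers are
  an example, not extra constraints): take sliceheight = 8 and BLOCKY = 4, i.e. four warps
  per block and one block per slice. Lane l of each warp accumulates entries of row l % 8,
  so the first __shfl_down_sync() loop (offsets 16, 8) folds the 32 lanes down to 8
  row-partials per warp. The shared-memory transpose then gathers the 4 per-warp partials of
  each row into one warp, where the second loop (offsets 2, 1 within groups of BLOCKY)
  finishes the row sums, and warp 0 writes the sliceheight results.
*/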
template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
#pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}
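/*
  Notes on segment_scan() (explanatory sketch): the block is treated as BLOCKY * 2 half-warps
  of 16 threads, and flag[h] records which slice half-warp h works on. Each round doubles the
  scan distance i: half-warp h publishes its lane values only if half-warp h - i belongs to
  the same slice, and half-warp h - i then folds them in. After log2(BLOCKY * 2) rounds the
  first half-warp of every slice segment holds, lane by lane, the complete sums for that
  segment; the return value is true exactly for those head half-warps, which are the ones
  that commit results via atomicAdd in the callers.
*/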
/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
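/*
  Mapping of the warp-per-slice kernels below, as a worked example (illustration only): warp
  threadIdx.y of block blockIdx.x owns slice blockIdx.x * blockDim.y + threadIdx.y, and all
  32 lanes stride through that slice's entries. With sliceheight = 4, lane l accumulates the
  entries of row l % 4 found at positions l, l + 32, l + 64, ... of the slice, and the
  shuffle loop (offsets 16, 8, 4) folds the 32 partials down to lanes 0..3, one row sum per
  lane.
*/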
/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  MatScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  MatScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
}
#endif

/*********** Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison **********/
static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
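/*
  Kernel selection (summary of the dispatch in the two routines below):
    0   - heuristic: picks among kernels 7, 8, and 9 from slice-width statistics
    1   - basic kernel, one thread per row
    2-6 - tiled kernels hard-wired to slice height 16, kept for performance comparison
    7   - one warp per slice, best for small slice widths
    8   - chunk-based load balancing with atomic updates
    9   - one block per slice, best for large slice widths
*/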
static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be a multiple of SLICE_HEIGHT * 32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
#if !defined(PETSC_USE_COMPLEX)
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscReal maxoveravg;
#endif

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays may not be available until MatSeqSELLCUDACopyToGPU() is called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9:
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (2 * sliceheight);
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0:
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = cudastruct->totalchunks;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 32) {
        matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal maxoveravg;
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscInt  blocky = cudastruct->blocky;
#endif

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays may not be available until MatSeqSELLCUDACopyToGPU() is called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks         = cudastruct->totalchunks;
      blocky          = a->chunksize / 32;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks         = cudastruct->totalchunks;
        blocky          = a->chunksize / 32;
        chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel, blocky;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be one of {2,4,8,16,32}", blocky);
    cudastruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
    if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLCUDACopyToGPU(A));
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
{
  PetscBool    both = PETSC_FALSE;
  Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
    if (cudastruct->val) {
      both = PETSC_TRUE;
      PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
    }
  }
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*MC
  MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

  Options Database Keys:
+ -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
. -mat_sell_spmv_cuda_kernel - selects an SpMV kernel for `MATSELLCUDA`
- -mat_sell_spmv_cuda_blocky - sets the y dimension of the block size of the SpMV kernels. These kernels use a 2D block with the x dimension being 32

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
M*/

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
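/*
  Example usage (a minimal sketch of a generic PETSc program, not part of this file; n, x,
  and y are placeholders):

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, n, n, n, n));
    PetscCall(MatSetType(A, MATSEQSELLCUDA));
    PetscCall(MatSetFromOptions(A));
    // ... set values, MatAssemblyBegin()/MatAssemblyEnd() ...
    PetscCall(MatMult(A, x, y)); // dispatches to one of the CUDA kernels above

  with, for instance, the runtime options

    ./app -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 7 -mat_sell_spmv_cuda_blocky 4
*/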