#include <cuda_runtime.h>

#include <petscdevice_cuda.h>
#include <petsc/private/cupmatomics.hpp>
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/

/* Slice height hard-wired into kernels 2-6 below; the other kernels take it as a runtime argument */
#define SLICE_HEIGHT 16

/* Device-side mirror of the sliced-ELLPACK (SELL) storage of a Mat_SeqSELL */
typedef struct {
  PetscInt   maxallocmat;     /* allocated length of colidx/val on the device */
  PetscInt   totalentries;    /* number of stored entries, i.e. sliidx[totalslices] */
  PetscInt  *colidx;          /* column index array, device pointer */
  MatScalar *val;             /* value array, device pointer */
  PetscInt   totalslices;     /* number of slices */
  PetscInt  *sliidx;          /* slice index array, device pointer */
  PetscInt   nonzerostate;    /* matrix nonzero state at the time of the last full device copy */
  PetscInt   kernelchoice;    /* which SpMV kernel to launch (0 selects heuristically at MatMult time) */
  PetscInt   blocky;          /* y-dimension of the thread block used by the tiled kernels */
  PetscInt   chunksperblock;  /* chunks handled per block by the load-balancing kernel (0 = derive from average slice size) */
  PetscInt   totalchunks;     /* number of chunks */
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;

/* Free the device arrays held by the struct, then the host-side struct itself */
static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
    if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
    if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
    if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mirror the host-side SELL data onto the GPU when the device copy is absent or stale.
   If the matrix is assembled and the nonzero state is unchanged, only the values are
   transferred; otherwise all device arrays are freed, reallocated, and fully repopulated
   (values, column indices, slice offsets, chunk-to-slice map). On success the offload
   mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only; the sparsity pattern on the device is still valid */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      /* nonzero pattern changed (or first assembly): rebuild every device array */
      if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
      if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
      if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
      if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
      cudastruct->maxallocmat  = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices  = a->totalslices;
      cudastruct->totalchunks  = a->totalchunks;
      /* allocate at the host's capacity (maxallocmat), copy only the used part below */
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
      /* copy values, nz or maxallocmat? */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A*x, one thread per row; entries of a row are stored sliceheight apart within a slice */
static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const
PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    /* consecutive entries of one row are strided by sliceheight inside the slice */
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

/* z = y + A*x, one thread per row (same layout as matmult_seqsell_basic_kernel) */
static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

#if !defined(PETSC_USE_COMPLEX)
/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  /* each thread accumulates a partial sum for one row of the slice handled by this block */
  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  /* fold lanes of each warp that own the same row (stride sliceheight..16) */
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 &&
threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
}

/* Segmented inclusive scan over the 2*BLOCKY half-warp partial sums; flag[] holds the slice
   id owned by each half warp, so sums are only combined within one slice (segment).
   Returns true for the head half-warp of a segment, i.e. the one that must write the
   segment's total to the output. */
template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
#pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false; /* a predecessor in the same segment exists */
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y; results are accumulated into y with atomics below */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        /* combine partial sums that belong to the same slice; only segment heads write */
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* load-balancing version.
Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z; products are accumulated into z with atomics below */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        /* combine partial sums belonging to the same slice; only segment heads write */
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  /* accumulate in PetscScalar, consistent with kernels 8 and 9; a plain 'double'
     accumulator would silently differ from the matrix scalar type in
     single-precision builds */
  PetscScalar t = 0.0;
  if (row < nrows) {
    /* the 32 lanes of the warp walk the slice with stride 32 */
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  /* warp-level reduction down to one value per row */
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
}

/*
use 1 warp per slice, suitable for small slice width */
static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  /* accumulate in PetscScalar, consistent with the other kernels; a plain 'double'
     accumulator would silently differ from the matrix scalar type in
     single-precision builds */
  PetscScalar t = 0.0;
  if (row < nrows) {
    /* the 32 lanes of the warp walk the slice with stride 32 */
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  /* warp-level reduction down to one value per row */
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
}
#endif

/*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/

/* y = A*x with fixed slice height 16; blockDim.y partial sums per row reduced in shared memory */
static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row.
*/
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over the blockDim.y partial sums of each row.
       NOTE(review): these barriers sit inside the row < nrows branch; this is safe only
       if all threads of a block agree on the branch — confirm nrows handling at the tail. */
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

/* y = A*x, slice height 16; like kernel6 but with blockDim.y = 16 (one fewer reduction step) */
static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

/* y = A*x, slice height 16; blockDim.y = 8 variant */
static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

/* y = A*x, slice height 16; blockDim.y = 4 variant */
static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

/* y = A*x, slice height 16; blockDim.y = 2 variant (single reduction step) */
static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

/* z = y + A*x, slice height 16; add-variant of matmult_seqsell_tiled_kernel6 */
static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row.
*/
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over the blockDim.y partial sums of each row.
       NOTE(review): barriers inside the row < nrows branch; safe only if all threads of a
       block agree on the branch — confirm nrows handling at the grid tail. */
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

/* z = y + A*x, slice height 16; blockDim.y = 16 variant */
static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

/* z = y + A*x, slice height 16; blockDim.y = 8 variant */
static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

/* z = y + A*x, slice height 16; blockDim.y = 4 variant */
static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

/* z = y + A*x, slice height 16; blockDim.y = 2 variant (single reduction step) */
static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row.
*/ 553 row = blockIdx.x * blockDim.x + threadIdx.x; 554 if (row < nrows) { 555 slice_id = row / SLICE_HEIGHT; 556 row_in_slice = row % SLICE_HEIGHT; 557 558 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 559 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 560 __syncthreads(); 561 if (threadIdx.y < 1) { 562 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 563 z[row] = y[row] + shared[threadIdx.x]; 564 } 565 } 566 } 567 568 static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 569 { 570 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 571 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 572 PetscScalar *y; 573 const PetscScalar *x; 574 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 575 MatScalar *aval; 576 PetscInt *acolidx; 577 PetscInt *sliidx; 578 PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 579 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 580 #if !defined(PETSC_USE_COMPLEX) 581 PetscInt chunksperblock, nchunks, *chunk_slice_map; 582 PetscReal maxoveravg; 583 #endif 584 585 PetscFunctionBegin; 586 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 587 PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 588 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 589 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 590 aval = cudastruct->val; 591 acolidx = cudastruct->colidx; 592 sliidx = cudastruct->sliidx; 
593 594 PetscCall(VecCUDAGetArrayRead(xx, &x)); 595 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 596 PetscCall(PetscLogGpuTimeBegin()); 597 598 switch (cudastruct->kernelchoice) { 599 #if !defined(PETSC_USE_COMPLEX) 600 case 9: 601 nblocks = 1 + (nrows - 1) / sliceheight; 602 if (cudastruct->blocky == 2) { 603 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 604 } else if (cudastruct->blocky == 4) { 605 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 606 } else if (cudastruct->blocky == 8) { 607 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 608 } else if (cudastruct->blocky == 16) { 609 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 610 } else if (cudastruct->blocky == 32) { 611 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 612 } else { 613 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 614 } 615 break; 616 case 7: 617 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 618 if (cudastruct->blocky == 2) { 619 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 620 } else if (cudastruct->blocky == 4) { 621 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 622 } else if (cudastruct->blocky == 8) { 623 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 624 } else if (cudastruct->blocky == 16) { 625 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 626 } else if (cudastruct->blocky == 32) { 627 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, 
x, y); 628 } else { 629 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 630 } 631 break; 632 #endif 633 case 6: 634 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 635 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 636 break; 637 case 5: 638 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 639 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 640 break; 641 case 4: 642 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 643 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 644 break; 645 case 3: 646 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 647 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 648 break; 649 case 2: /* 16 slices per block if blocksize=512 */ 650 nblocks = 1 + (nrows - 1) / (blocksize / 2); 651 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 652 break; 653 case 1: /* 32 slices per block if blocksize=512 */ 654 nblocks = 1 + (nrows - 1) / blocksize; 655 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 656 break; 657 #if !defined(PETSC_USE_COMPLEX) 658 case 0: 659 maxoveravg = a->maxslicewidth / a->avgslicewidth; 660 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 661 /* each block handles approximately one slice */ 662 PetscInt blocky = a->chunksize / 32; 663 nchunks = cudastruct->totalchunks; 664 chunksperblock = cudastruct->chunksperblock ? 
cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 665 nblocks = 1 + (nchunks - 1) / chunksperblock; 666 chunk_slice_map = cudastruct->chunk_slice_map; 667 if (blocky == 2) { 668 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 669 } else if (blocky == 4) { 670 matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 671 } else if (blocky == 8) { 672 matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 673 } else if (blocky == 16) { 674 matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 675 } else if (blocky == 32) { 676 matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 677 } else { 678 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 679 } 680 } else { 681 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 682 if (avgslicesize <= 432) { 683 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 684 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 685 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 686 } else { 687 nblocks = 1 + (nrows - 1) / sliceheight; 688 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 689 } 690 } else if (avgslicesize <= 2400) { 691 nblocks = 1 + (nrows - 1) / sliceheight; 692 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, 
sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatMultAdd_SeqSELLCUDA - computes zz = yy + A*xx on the GPU for a SeqSELL matrix.

  The value/column/slice arrays are first synchronized to the device via
  MatSeqSELLCUDACopyToGPU(). One of several CUDA kernels is then launched
  according to cudastruct->kernelchoice (settable with
  -mat_sell_spmv_cuda_kernel); choice 0 picks a kernel heuristically from the
  slice-width statistics, mirroring the dispatch logic in MatMult_SeqSELLCUDA.
  Kernels 7-9 (and the chunk-based kernel 8) are compiled only for real
  scalars. If the matrix has no stored nonzeros, the result is simply a copy
  of yy into zz.
*/
static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval    = cudastruct->val;
  PetscInt          *acolidx = cudastruct->colidx;
  PetscInt          *sliidx  = cudastruct->sliidx;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal maxoveravg;
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscInt  blocky = cudastruct->blocky;
#endif

  PetscFunctionBegin;
  /* the tiled kernels assume an integral number of slices per 32-lane warp */
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);

    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9: /* one slice per block; block y-dimension selected by -mat_sell_spmv_cuda_blocky */
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks = cudastruct->totalchunks;
      blocky  = a->chunksize / 32;
      chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks        = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7: /* two slices per block */
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6: /* 1 slice per block if blocksize=512 */
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5: /* 2 slices per block if blocksize=512 */
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4: /* 4 slices per block if blocksize=512 */
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3: /* 8 slices per block if blocksize=512 */
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2: /* 16 slices per block if blocksize=512 */
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1: /* 32 slices per block if blocksize=512 */
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0: /* heuristic choice based on slice-width statistics */
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks = cudastruct->totalchunks;
        blocky  = a->chunksize / 32;
        chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks        = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      /* fixed: the message previously named MatMult_SeqSELLCUDA (copy-paste error) */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz)); /* one multiply and one add per stored nonzero */
  } else {
    /* empty matrix: z = y + 0*x reduces to a vector copy */
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Process -mat_sell_spmv_cuda_* options controlling kernel selection */
static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel, blocky;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
    cudastruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
    if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Computes and caches the slice statistics (average/max slice width, fill
  ratio) on the host-side Mat_SeqSELL struct; these feed the heuristic kernel
  selection (kernelchoice 0) in the SpMV routines.
*/
PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Assembly-end hook: runs the host SeqSELL assembly, refreshes the SpMV
  statistics, and (for final assembly of an unfactored matrix) pushes the
  data to the GPU and installs the CUDA mult/multadd operations.
*/
static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  /* flush assembly may be followed by more insertions; defer the GPU copy */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
  A->ops->mult = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Zeroes the stored values on both host and (if allocated) device, keeping
  the nonzero pattern. The offload mask reflects whether both copies were
  zeroed or only the host copy.
*/
static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
{
  PetscBool    both = PETSC_FALSE;
  Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
    if (cudastruct->val) {
      both = PETSC_TRUE;
      /* byte-wise memset is valid here because the fill value is zero */
      PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
    }
  }
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Frees the device-side storage (if any) and then the host SeqSELL data. */
static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);

/* Duplicates via the host SeqSELL path, then converts the copy to CUDA. */
static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  In-place conversion of a SeqSELL matrix to SeqSELLCUDA: sets the default
  vector type to CUDA, allocates the (empty) device-side struct, and installs
  the CUDA operation table. Device arrays are not allocated here; they are
  created lazily by MatSeqSELLCUDACopyToGPU(), hence the UNALLOCATED offload
  mask at the end.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

  Options Database Keys:
+ -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
. -mat_sell_spmv_cuda_kernel - selects a spmv kernel for MatSELLCUDA
- -mat_sell_spmv_cuda_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension being 32

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
M*/

/* Type constructor: builds a SeqSELL matrix and converts it to SeqSELLCUDA. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}