1 #include <cuda_runtime.h> 2 3 #include <petscdevice_cuda.h> 4 #include <petsc/private/cupmatomics.hpp> 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 7 #define SLICE_HEIGHT 16 8 9 typedef struct { 10 PetscInt maxallocmat; 11 PetscInt totalentries; 12 PetscInt *colidx; /* column index array, device pointer */ 13 MatScalar *val; /* value array, device pointer */ 14 PetscInt totalslices; 15 PetscInt *sliidx; /* slice index array, device pointer */ 16 PetscInt nonzerostate; 17 PetscInt kernelchoice; 18 PetscInt blocky; 19 PetscInt chunksperblock; 20 PetscInt totalchunks; 21 PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */ 22 } Mat_SeqSELLCUDA; 23 24 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 25 { 26 PetscFunctionBegin; 27 if (*cudastruct) { 28 if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 29 if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 30 if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 31 if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); } 32 PetscCall(PetscFree(*cudastruct)); 33 } 34 PetscFunctionReturn(PETSC_SUCCESS); 35 } 36 37 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 38 { 39 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 40 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 41 42 PetscFunctionBegin; 43 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 44 PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 45 if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 46 /* copy values only */ 47 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 48 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 49 } else { 50 if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 51 if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 52 if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 53 if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); } 54 cudastruct->maxallocmat = a->maxallocmat; 55 cudastruct->totalentries = a->sliidx[a->totalslices]; 56 cudastruct->totalslices = a->totalslices; 57 cudastruct->totalchunks = a->totalchunks; 58 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(*(cudastruct->colidx)))); 59 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(*(cudastruct->val)))); 60 /* copy values, nz or maxallocmat? */ 61 PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*(a->colidx)), cudaMemcpyHostToDevice)); 62 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*(a->val)), cudaMemcpyHostToDevice)); 63 64 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(*(cudastruct->sliidx)))); 65 PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*(a->sliidx)), cudaMemcpyHostToDevice)); 66 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(*(cudastruct->chunk_slice_map)))); 67 PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*(a->chunk_slice_map)), cudaMemcpyHostToDevice)); 68 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt))); 69 } 70 PetscCallCUDA(WaitForCUDA()); 71 PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 72 A->offloadmask = PETSC_OFFLOAD_BOTH; 73 } 74 PetscFunctionReturn(PETSC_SUCCESS); 75 } 76 77 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 78 { 79 PetscInt i, row, slice_id, row_in_slice; 80 MatScalar sum; 81 /* one thread per row. */ 82 row = blockIdx.x * blockDim.x + threadIdx.x; 83 if (row < nrows) { 84 slice_id = row / sliceheight; 85 row_in_slice = row % sliceheight; 86 sum = 0.0; 87 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 88 y[row] = sum; 89 } 90 } 91 92 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 93 { 94 PetscInt i, row, slice_id, row_in_slice; 95 MatScalar sum; 96 /* one thread per row. */ 97 row = blockIdx.x * blockDim.x + threadIdx.x; 98 if (row < nrows) { 99 slice_id = row / sliceheight; 100 row_in_slice = row % sliceheight; 101 sum = 0.0; 102 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 103 z[row] = y[row] + sum; 104 } 105 } 106 107 #if !defined(PETSC_USE_COMPLEX) 108 /* use 1 block per slice, suitable for large slice width */ 109 template <int BLOCKY> 110 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 111 { 112 __shared__ MatScalar shared[32][BLOCKY]; 113 PetscInt i, row, slice_id = blockIdx.x; 114 int tid = threadIdx.x + threadIdx.y * 32; 115 /* transposed index */ 116 int tidx = tid % BLOCKY; 117 int tidy = tid / BLOCKY; 118 PetscScalar t = 0.0; 119 120 row = slice_id * sliceheight + threadIdx.x % sliceheight; 121 if (row < nrows) { 122 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 123 } 124 #pragma unroll 125 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 126 /* transpose layout to reduce each row using warp shfl */ 127 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 128 __syncthreads(); 129 if (tidy < sliceheight) t = shared[tidy][tidx]; 130 #pragma unroll 131 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 132 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 133 __syncthreads(); 134 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 135 } 136 137 /* use 1 block per slice, suitable for large slice width */ 138 template <int BLOCKY> 139 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 140 { 141 __shared__ MatScalar shared[32][BLOCKY]; 142 PetscInt i, row, slice_id = blockIdx.x; 143 int tid = threadIdx.x + threadIdx.y * 32; 144 /* transposed index */ 145 int tidx = tid % BLOCKY; 146 int tidy = tid / BLOCKY; 147 PetscScalar t = 0.0; 148 149 row = slice_id * sliceheight + threadIdx.x % sliceheight; 150 if (row < nrows) { 151 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 152 } 153 #pragma unroll 154 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 155 /* transpose layout to reduce each row using warp shfl */ 156 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 157 __syncthreads(); 158 if (tidy < sliceheight) t = shared[tidy][tidx]; 159 #pragma unroll 160 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 161 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 162 __syncthreads(); 163 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; } 164 } 165 166 template <int BLOCKY> 167 __device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val) 168 { 169 bool head = true; 170 #pragma unroll 171 for (int i = 1; i < BLOCKY * 2; i <<= 1) { 172 int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16; 173 shared[threadIdx.x + threadIdx.y * 32] = 0; 174 if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) { 175 shared[threadIdx.x + threadIdx.y * 32] = *val; 176 if (i == 1) head = false; 177 } 178 __syncthreads(); 179 if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16]; 180 __syncthreads(); 181 } 182 return head; 183 } 184 185 /* load-balancing version. Chunksize is equal to the number of threads per block */ 186 template <int BLOCKY> 187 __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 188 { 189 __shared__ MatScalar shared[BLOCKY * 32]; 190 PetscInt gid, row, start_slice, cid; 191 PetscScalar t = 0.0; 192 AtomicAdd<MatScalar> atomAdd; 193 /* zero out y */ 194 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 195 gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 196 if (gid < nrows) y[gid] = 0.0; 197 } 198 for (int iter = 0; iter < chunksperblock; iter++) { 199 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 200 if (cid < totalchunks) { 201 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 202 gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 203 if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 204 __shared__ PetscInt flag[BLOCKY * 2]; 205 bool write; 206 PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 207 /* find out the slice that this element belongs to */ 208 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 209 if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 210 row = slice_id * sliceheight + threadIdx.x % sliceheight; 211 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 212 __syncthreads(); 213 write = segment_scan<BLOCKY>(flag, shared, &t); 214 if (row < nrows && gid < totalentries && write) atomAdd(y[row], t); 215 t = 0.0; 216 } else { /* this iteration covers only one slice */ 217 row = start_slice * sliceheight + threadIdx.x % sliceheight; 218 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 219 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 220 int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 221 /* reduction and write to output vector */ 222 #pragma unroll 223 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 224 /* transpose layout to reduce each row using warp shfl */ 225 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 226 __syncthreads(); 227 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 228 #pragma unroll 229 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 230 if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 231 __syncthreads(); 232 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 233 t = 0.0; 234 } 235 } 236 } 237 } 238 } 239 240 /* load-balancing version. Chunksize is equal to the number of threads per block */ 241 template <int BLOCKY> 242 __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 243 { 244 __shared__ MatScalar shared[BLOCKY * 32]; 245 PetscInt gid, row, start_slice, cid; 246 PetscScalar t = 0.0; 247 AtomicAdd<MatScalar> atomAdd; 248 /* copy y to z */ 249 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 250 gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 251 if (gid < nrows) z[gid] = y[gid]; 252 } 253 for (int iter = 0; iter < chunksperblock; iter++) { 254 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 255 if (cid < totalchunks) { 256 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 257 gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 258 if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 259 __shared__ PetscInt flag[BLOCKY * 2]; 260 bool write; 261 PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 262 /* find out the slice that this element belongs to */ 263 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 264 if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 265 row = slice_id * sliceheight + threadIdx.x % sliceheight; 266 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 267 __syncthreads(); 268 write = segment_scan<BLOCKY>(flag, shared, &t); 269 if (row < nrows && gid < totalentries && write) atomAdd(z[row], t); 270 t = 0.0; 271 } else { /* this iteration covers only one slice */ 272 row = start_slice * sliceheight + threadIdx.x % sliceheight; 273 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 274 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 275 int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 276 /* reduction and write to output vector */ 277 #pragma unroll 278 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 279 /* transpose layout to reduce each row using warp shfl */ 280 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 281 __syncthreads(); 282 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 283 #pragma unroll 284 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 285 if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 286 __syncthreads(); 287 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 288 t = 0.0; 289 } 290 } 291 } 292 } 293 } 294 295 /* use 1 warp per slice, suitable for small slice width */ 296 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 297 { 298 PetscInt i, row, slice_id; 299 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 300 row = slice_id * sliceheight + threadIdx.x % sliceheight; 301 double t = 0.0; 302 if (row < nrows) { 303 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 304 } 305 #pragma unroll 306 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 307 if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 308 } 309 310 /* use 1 warp per slice, suitable for small slice width */ 311 __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 312 { 313 PetscInt i, row, slice_id; 314 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 315 row = slice_id * sliceheight + threadIdx.x % sliceheight; 316 double t = 0.0; 317 if (row < nrows) { 318 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 319 } 320 #pragma unroll 321 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 322 if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; } 323 } 324 #endif 325 326 /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 327 328 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 329 { 330 __shared__ MatScalar shared[512]; 331 PetscInt i, row, slice_id, row_in_slice; 332 /* multiple threads per row. */ 333 row = blockIdx.x * blockDim.x + threadIdx.x; 334 if (row < nrows) { 335 slice_id = row / SLICE_HEIGHT; 336 row_in_slice = row % SLICE_HEIGHT; 337 338 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 339 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 340 __syncthreads(); 341 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 342 __syncthreads(); 343 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 344 __syncthreads(); 345 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 346 __syncthreads(); 347 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 348 __syncthreads(); 349 if (threadIdx.y < 1) { 350 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 351 y[row] = shared[threadIdx.x]; 352 } 353 } 354 } 355 356 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 357 { 358 __shared__ MatScalar shared[512]; 359 PetscInt i, row, slice_id, row_in_slice; 360 /* multiple threads per row. */ 361 row = blockIdx.x * blockDim.x + threadIdx.x; 362 if (row < nrows) { 363 slice_id = row / SLICE_HEIGHT; 364 row_in_slice = row % SLICE_HEIGHT; 365 366 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 367 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 368 __syncthreads(); 369 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 370 __syncthreads(); 371 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 372 __syncthreads(); 373 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 374 __syncthreads(); 375 if (threadIdx.y < 1) { 376 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 377 y[row] = shared[threadIdx.x]; 378 } 379 } 380 } 381 382 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 383 { 384 __shared__ MatScalar shared[512]; 385 PetscInt i, row, slice_id, row_in_slice; 386 /* multiple threads per row. */ 387 row = blockIdx.x * blockDim.x + threadIdx.x; 388 if (row < nrows) { 389 slice_id = row / SLICE_HEIGHT; 390 row_in_slice = row % SLICE_HEIGHT; 391 392 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 393 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 394 __syncthreads(); 395 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 396 __syncthreads(); 397 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 398 __syncthreads(); 399 if (threadIdx.y < 1) { 400 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 401 y[row] = shared[threadIdx.x]; 402 } 403 } 404 } 405 406 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 407 { 408 __shared__ MatScalar shared[512]; 409 PetscInt i, row, slice_id, row_in_slice; 410 /* multiple threads per row. */ 411 row = blockIdx.x * blockDim.x + threadIdx.x; 412 if (row < nrows) { 413 slice_id = row / SLICE_HEIGHT; 414 row_in_slice = row % SLICE_HEIGHT; 415 416 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 417 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 418 __syncthreads(); 419 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 420 __syncthreads(); 421 if (threadIdx.y < 1) { 422 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 423 y[row] = shared[threadIdx.x]; 424 } 425 } 426 } 427 428 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 429 { 430 __shared__ MatScalar shared[512]; 431 PetscInt i, row, slice_id, row_in_slice; 432 /* multiple threads per row. */ 433 row = blockIdx.x * blockDim.x + threadIdx.x; 434 if (row < nrows) { 435 slice_id = row / SLICE_HEIGHT; 436 row_in_slice = row % SLICE_HEIGHT; 437 438 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 439 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 440 __syncthreads(); 441 if (threadIdx.y < 1) { 442 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 443 y[row] = shared[threadIdx.x]; 444 } 445 } 446 } 447 448 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 449 { 450 __shared__ MatScalar shared[512]; 451 PetscInt i, row, slice_id, row_in_slice; 452 /* multiple threads per row. */ 453 row = blockIdx.x * blockDim.x + threadIdx.x; 454 if (row < nrows) { 455 slice_id = row / SLICE_HEIGHT; 456 row_in_slice = row % SLICE_HEIGHT; 457 458 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 459 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 460 __syncthreads(); 461 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 462 __syncthreads(); 463 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 464 __syncthreads(); 465 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 466 __syncthreads(); 467 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 468 __syncthreads(); 469 if (threadIdx.y < 1) { 470 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 471 z[row] = y[row] + shared[threadIdx.x]; 472 } 473 } 474 } 475 476 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 477 { 478 __shared__ MatScalar shared[512]; 479 PetscInt i, row, slice_id, row_in_slice; 480 /* multiple threads per row. */ 481 row = blockIdx.x * blockDim.x + threadIdx.x; 482 if (row < nrows) { 483 slice_id = row / SLICE_HEIGHT; 484 row_in_slice = row % SLICE_HEIGHT; 485 486 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 487 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 488 __syncthreads(); 489 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 490 __syncthreads(); 491 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 492 __syncthreads(); 493 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 494 __syncthreads(); 495 if (threadIdx.y < 1) { 496 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 497 z[row] = y[row] + shared[threadIdx.x]; 498 } 499 } 500 } 501 502 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 503 { 504 __shared__ MatScalar shared[512]; 505 PetscInt i, row, slice_id, row_in_slice; 506 /* multiple threads per row. */ 507 row = blockIdx.x * blockDim.x + threadIdx.x; 508 if (row < nrows) { 509 slice_id = row / SLICE_HEIGHT; 510 row_in_slice = row % SLICE_HEIGHT; 511 512 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 513 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 514 __syncthreads(); 515 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 516 __syncthreads(); 517 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 518 __syncthreads(); 519 if (threadIdx.y < 1) { 520 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 521 z[row] = y[row] + shared[threadIdx.x]; 522 } 523 } 524 } 525 526 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 527 { 528 __shared__ MatScalar shared[512]; 529 PetscInt i, row, slice_id, row_in_slice; 530 /* multiple threads per row. */ 531 row = blockIdx.x * blockDim.x + threadIdx.x; 532 if (row < nrows) { 533 slice_id = row / SLICE_HEIGHT; 534 row_in_slice = row % SLICE_HEIGHT; 535 536 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 537 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 538 __syncthreads(); 539 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 540 __syncthreads(); 541 if (threadIdx.y < 1) { 542 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 543 z[row] = y[row] + shared[threadIdx.x]; 544 } 545 } 546 } 547 548 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 549 { 550 __shared__ MatScalar shared[512]; 551 PetscInt i, row, slice_id, row_in_slice; 552 /* multiple threads per row. */ 553 row = blockIdx.x * blockDim.x + threadIdx.x; 554 if (row < nrows) { 555 slice_id = row / SLICE_HEIGHT; 556 row_in_slice = row % SLICE_HEIGHT; 557 558 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 559 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 560 __syncthreads(); 561 if (threadIdx.y < 1) { 562 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 563 z[row] = y[row] + shared[threadIdx.x]; 564 } 565 } 566 } 567 568 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 569 { 570 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 571 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 572 PetscScalar *y; 573 const PetscScalar *x; 574 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 575 PetscInt chunksperblock, nchunks, *chunk_slice_map; 576 MatScalar *aval; 577 PetscInt *acolidx; 578 PetscInt *sliidx; 579 PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 580 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 581 PetscReal maxoveravg; 582 583 PetscFunctionBegin; 584 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 585 PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 586 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 587 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 588 aval = cudastruct->val; 589 acolidx = cudastruct->colidx; 590 sliidx = cudastruct->sliidx; 591 592 PetscCall(VecCUDAGetArrayRead(xx, &x)); 593 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 594 PetscCall(PetscLogGpuTimeBegin()); 595 596 switch (cudastruct->kernelchoice) { 597 #if !defined(PETSC_USE_COMPLEX) 598 case 9: 599 nblocks = 1 + (nrows - 1) / sliceheight; 600 if (cudastruct->blocky == 2) { 601 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 602 } else if (cudastruct->blocky == 4) { 603 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 604 } else if (cudastruct->blocky == 8) { 605 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 606 } else if (cudastruct->blocky == 16) { 607 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 608 } else if (cudastruct->blocky == 32) { 609 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 610 } else { 611 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 612 } 613 break; 614 case 7: 615 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 616 if (cudastruct->blocky == 2) { 617 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 618 } else if (cudastruct->blocky == 4) { 619 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 620 } else if (cudastruct->blocky == 8) { 621 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 622 } else if (cudastruct->blocky == 16) { 623 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 624 } else if (cudastruct->blocky == 32) { 625 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 626 } else { 627 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 628 } 629 break; 630 #endif 631 case 6: 632 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 633 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 634 break; 635 case 5: 636 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 637 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 638 break; 639 case 4: 640 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 641 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 642 break; 643 case 3: 644 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 645 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 646 break; 647 case 2: /* 16 slices per block if blocksize=512 */ 648 nblocks = 1 + (nrows - 1) / (blocksize / 2); 649 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 650 break; 651 case 1: /* 32 slices per block if blocksize=512 */ 652 nblocks = 1 + (nrows - 1) / blocksize; 653 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 654 break; 655 #if !defined(PETSC_USE_COMPLEX) 656 case 0: 657 maxoveravg = a->maxslicewidth / a->avgslicewidth; 658 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 659 /* each block handles approximately one slice */ 660 PetscInt blocky = a->chunksize / 32; 661 nchunks = cudastruct->totalchunks; 662 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 663 nblocks = 1 + (nchunks - 1) / chunksperblock; 664 chunk_slice_map = cudastruct->chunk_slice_map; 665 if (blocky == 2) { 666 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 667 } else if (blocky == 4) { 668 matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 669 } else if (blocky == 8) { 670 matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 671 } else if (blocky == 16) { 672 matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 673 } else if (blocky == 32) { 674 matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 675 } else { 676 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 677 } 678 } else { 679 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 680 if (avgslicesize <= 432) { 681 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 682 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 683 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 684 } else { 685 nblocks = 1 + (nrows - 1) / sliceheight; 686 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 687 } 688 } else if (avgslicesize <= 2400) { 689 nblocks = 1 + (nrows - 1) / sliceheight; 690 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 691 } else { 692 nblocks = 1 + (nrows - 1) / sliceheight; 693 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 694 } 695 } 696 break; 697 #endif 698 default: 699 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice); 700 } 701 PetscCall(PetscLogGpuTimeEnd()); 702 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 703 PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 704 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 705 PetscFunctionReturn(PETSC_SUCCESS); 706 } 707 708 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 709 { 710 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 711 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 712 PetscScalar *z; 713 const PetscScalar *y, *x; 714 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 715 PetscInt chunksperblock, nchunks, *chunk_slice_map; 716 MatScalar *aval = cudastruct->val; 717 PetscInt *acolidx = cudastruct->colidx; 718 PetscInt *sliidx = cudastruct->sliidx; 719 PetscReal maxoveravg; 720 721 PetscFunctionBegin; 722 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 723 PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 724 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 725 if (a->nz) { 726 PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512; 727 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 728 PetscCall(VecCUDAGetArrayRead(xx, &x)); 729 PetscCall(VecCUDAGetArrayRead(yy, &y)); 730 PetscCall(VecCUDAGetArrayWrite(zz, &z)); 731 PetscCall(PetscLogGpuTimeBegin()); 732 733 switch (cudastruct->kernelchoice) { 734 #if !defined(PETSC_USE_COMPLEX) 735 case 9: 736 nblocks = 1 + (nrows - 1) / sliceheight; 737 if (blocky == 2) { 738 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 739 } else if (blocky == 4) { 740 matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 741 } else if (blocky == 8) { 742 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 743 } else if (blocky == 16) { 744 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 745 } else if (blocky == 32) { 746 matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 747 } else { 748 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 749 } 750 break; 751 case 8: 752 /* each block handles approximately one slice */ 753 nchunks = cudastruct->totalchunks; 754 blocky = a->chunksize / 32; 755 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 756 nblocks = 1 + (nchunks - 1) / chunksperblock; 757 chunk_slice_map = cudastruct->chunk_slice_map; 758 if (blocky == 2) { 759 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 760 } else if (blocky == 4) { 761 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 762 } else if (blocky == 8) { 763 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 764 } else if (blocky == 16) { 765 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 766 } else if (blocky == 32) { 767 matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 768 } else { 769 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 770 } 771 break; 772 case 7: 773 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 774 if (blocky == 2) { 775 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 776 } else if (blocky == 4) { 777 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 778 } else if (blocky == 8) { 779 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 780 } else if (blocky == 16) { 781 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 782 } else if (blocky == 32) { 783 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 784 } else { 785 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 786 } 787 break; 788 #endif 789 case 6: 790 nblocks = 1 + (nrows - 1) / (blocksize / 32); 791 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 792 break; 793 case 5: 794 nblocks = 1 + (nrows - 1) / (blocksize / 16); 795 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 796 break; 797 case 4: 798 nblocks = 1 + (nrows - 1) / (blocksize / 8); 799 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 800 break; 801 case 3: 802 nblocks = 1 + (nrows - 1) / (blocksize / 4); 803 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 804 break; 805 case 2: 806 nblocks = 1 + (nrows - 1) / (blocksize / 2); 807 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 808 break; 809 case 1: 810 nblocks = 1 + (nrows - 1) / blocksize; 811 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 812 break; 813 #if !defined(PETSC_USE_COMPLEX) 814 case 0: 815 maxoveravg = a->maxslicewidth / a->avgslicewidth; 816 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 817 /* each block handles approximately one slice */ 818 nchunks = cudastruct->totalchunks; 819 blocky = a->chunksize / 32; 820 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 821 nblocks = 1 + (nchunks - 1) / chunksperblock; 822 chunk_slice_map = cudastruct->chunk_slice_map; 823 if (blocky == 2) { 824 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 825 } else if (blocky == 4) { 826 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 827 } else if (blocky == 8) { 828 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 829 } else if (blocky == 16) { 830 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 831 } else if (blocky == 32) { 832 matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 833 } else { 834 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 835 } 836 } else { 837 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 838 if (avgslicesize <= 432) { 839 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 840 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 841 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 842 } else { 843 nblocks = 1 + (nrows - 1) / sliceheight; 844 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 845 } 846 } else if (avgslicesize <= 2400) { 847 nblocks = 1 + (nrows - 1) / sliceheight; 848 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 849 } else { 850 nblocks = 1 + (nrows - 1) / sliceheight; 851 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 852 } 853 } 854 break; 855 #endif 856 default: 857 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice); 858 } 859 PetscCall(PetscLogGpuTimeEnd()); 860 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 861 PetscCall(VecCUDARestoreArrayRead(yy, &y)); 862 PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 863 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 864 } else { 865 PetscCall(VecCopy(yy, zz)); 866 } 867 PetscFunctionReturn(PETSC_SUCCESS); 868 } 869 870 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 871 { 872 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 873 PetscInt kernel, blocky; 874 PetscBool flg; 875 876 PetscFunctionBegin; 877 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 878 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 879 if (flg) { 880 PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky); 881 cudastruct->blocky = blocky; 882 } 883 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 884 if (flg) { 885 PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 886 cudastruct->kernelchoice = kernel; 887 if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); } 888 } 889 PetscOptionsHeadEnd(); 890 PetscFunctionReturn(PETSC_SUCCESS); 891 } 892 893 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 894 { 895 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 896 897 PetscFunctionBegin; 898 PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 899 PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 900 PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 901 PetscFunctionReturn(PETSC_SUCCESS); 902 } 903 904 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 905 { 906 PetscFunctionBegin; 907 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 908 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 909 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 910 if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 911 A->ops->mult = MatMult_SeqSELLCUDA; 912 A->ops->multadd = MatMultAdd_SeqSELLCUDA; 913 PetscFunctionReturn(PETSC_SUCCESS); 914 } 915 916 static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A) 917 { 918 PetscBool both = PETSC_FALSE; 919 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 920 921 PetscFunctionBegin; 922 if (A->factortype == MAT_FACTOR_NONE) { 923 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 924 if (cudastruct->val) { 925 both = PETSC_TRUE; 926 PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*(cudastruct->val)))); 927 } 928 } 929 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 930 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 931 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 932 else A->offloadmask = PETSC_OFFLOAD_CPU; 933 PetscFunctionReturn(PETSC_SUCCESS); 934 } 935 936 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 937 { 938 PetscFunctionBegin; 939 if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); 940 PetscCall(MatDestroy_SeqSELL(A)); 941 PetscFunctionReturn(PETSC_SUCCESS); 942 } 943 944 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 945 static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 946 { 947 PetscFunctionBegin; 948 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 949 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 950 PetscFunctionReturn(PETSC_SUCCESS); 951 } 952 953 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 954 { 955 Mat_SeqSELLCUDA *cudastruct; 956 957 PetscFunctionBegin; 958 PetscCall(PetscFree(B->defaultvectype)); 959 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 960 961 if (!B->spptr) { 962 if (B->factortype == MAT_FACTOR_NONE) { 963 PetscCall(PetscNew(&cudastruct)); 964 B->spptr = cudastruct; 965 } 966 } 967 968 B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 969 B->ops->destroy = MatDestroy_SeqSELLCUDA; 970 B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 971 B->ops->mult = MatMult_SeqSELLCUDA; 972 B->ops->multadd = MatMultAdd_SeqSELLCUDA; 973 B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 974 B->ops->zeroentries = MatZeroEntries_SeqSELLCUDA; 975 976 /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 977 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 978 979 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 980 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 981 PetscFunctionReturn(PETSC_SUCCESS); 982 } 983 984 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 985 { 986 PetscFunctionBegin; 987 PetscCall(MatCreate_SeqSELL(B)); 988 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 989 PetscCall(MatSetFromOptions(B)); 990 PetscFunctionReturn(PETSC_SUCCESS); 991 } 992