/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
  #include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Enum-name table consumed by PetscOptionsEnum(); the trailing entries are the enum type name,
   the option prefix, and the required NULL terminator */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* Forward declarations of the factorization, solve, and mat-vec implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Implementation behind MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE: records the requested
   cuSPARSE storage format in the per-matrix Mat_SeqAIJCUSPARSE struct. Both supported
   operations currently map to the same single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a seq matrix has only one stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
   `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; silently a no-op for matrix types
     that do not compose "MatCUSPARSESetFormat_C" (that is the PetscTryMethod contract) */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}

/* Implementation behind MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE: stores the flag;
   it is consulted later when MatLUFactorNumeric_SeqAIJCUSPARSE() chooses the solve routines */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use of the CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or the GPU (GPU is the default).

   Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}

/* MatSetOption() implementation: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the cached GPU
   transpose can be invalidated; all other options are forwarded to the SeqAIJ base class */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ);
   afterwards the triangular factors are optionally copied to the GPU for the solves.
   B is the factor matrix, A the matrix being factored. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before the CPU factorization */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
  */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    /* identity row and column permutations allow the faster natural-ordering GPU solve */
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Processes -mat_cusparse_* command line options (storage format, CPU solve flag, and the
   cuSPARSE SpMV/SpMM/csr2csc algorithm choices on CUDA >= 11) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same positional-consistency guard as above, for the SpMM enum */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Builds (or updates) the unit-diagonal lower triangular factor L on the GPU from the
   CPU ILU factorization stored in the SeqAIJ factor matrix, and runs the cuSPARSE
   triangular-solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): this relies on the SeqAIJ *factored* storage layout where rows 1..n-1 of a->i/a->j
         hold the strictly-lower part; row 0 of L is the bare unit diagonal -- confirm against aijfact.c */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first call: build the full CSR structure of L in pinned host memory, then copy to the GPU */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the workspace required by the csrsv2 analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep the pinned value buffer (AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only: the sparsity pattern of L is unchanged */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds (or updates) the upper triangular factor U on the GPU from the CPU ILU
   factorization (U is stored row-reversed via a->diag in SeqAIJ factored storage),
   and runs the cuSPARSE triangular-solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt    n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         NOTE(review): adiag[] decreases with the row index in SeqAIJ factored storage,
         so adiag[0]-adiag[n] counts all U entries -- confirm against aijfact.c */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        /* first call: build the full CSR structure of U in pinned host memory, back to front */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, iterating rows from last to first */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* set the pointers to row i of U (stored after adiag[i+1] in the factored arrays) */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; the factored storage keeps 1/diag, so invert back */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the workspace required by the csrsv2 analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep the pinned value buffer (AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only: the sparsity pattern of U is unchanged */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU and uploads the row/column permutation
   indices (when the orderings are not the identity) needed by the permuted GPU solve */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector reused between the two triangular solves */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only when it is non-trivial */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices: upload the column permutation only when it is non-trivial */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n *
sizeof(PetscInt))); 574 } 575 PetscFunctionReturn(0); 576 } 577 578 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 579 { 580 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 581 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 582 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 583 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 584 PetscInt *AiUp, *AjUp; 585 PetscScalar *AAUp; 586 PetscScalar *AALo; 587 PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 588 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 589 const PetscInt *ai = b->i, *aj = b->j, *vj; 590 const MatScalar *aa = b->a, *v; 591 592 PetscFunctionBegin; 593 if (!n) PetscFunctionReturn(0); 594 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 595 try { 596 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 597 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 598 if (!upTriFactor && !loTriFactor) { 599 /* Allocate Space for the upper triangular matrix */ 600 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 601 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 602 603 /* Fill the upper triangular matrix */ 604 AiUp[0] = (PetscInt)0; 605 AiUp[n] = nzUpper; 606 offset = 0; 607 for (i = 0; i < n; i++) { 608 /* set the pointers */ 609 v = aa + ai[i]; 610 vj = aj + ai[i]; 611 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 612 613 /* first, set the diagonal elements */ 614 AjUp[offset] = (PetscInt)i; 615 AAUp[offset] = (MatScalar)1.0 / v[nz]; 616 AiUp[i] = offset; 617 AALo[offset] = (MatScalar)1.0 / v[nz]; 618 619 offset += 1; 620 if (nz > 0) { 621 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 622 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 623 for (j = 
offset; j < offset + nz; j++) { 624 AAUp[j] = -AAUp[j]; 625 AALo[j] = AAUp[j] / v[nz]; 626 } 627 offset += nz; 628 } 629 } 630 631 /* allocate space for the triangular factor information */ 632 PetscCall(PetscNew(&upTriFactor)); 633 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 634 635 /* Create the matrix description */ 636 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 637 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 638 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 639 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 640 #else 641 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 642 #endif 643 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 644 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 645 646 /* set the matrix */ 647 upTriFactor->csrMat = new CsrMatrix; 648 upTriFactor->csrMat->num_rows = A->rmap->n; 649 upTriFactor->csrMat->num_cols = A->cmap->n; 650 upTriFactor->csrMat->num_entries = a->nz; 651 652 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 653 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 654 655 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 656 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 657 658 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 659 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 660 661 /* set the operation */ 662 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 663 664 /* Create the solve analysis information */ 665 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 666 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 667 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 668 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, 
upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 669 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 670 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 671 #endif 672 673 /* perform the solve analysis */ 674 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 675 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), 676 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 677 upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 678 #else 679 upTriFactor->solveInfo)); 680 #endif 681 PetscCallCUDA(WaitForCUDA()); 682 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 683 684 /* assign the pointer */ 685 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 686 687 /* allocate space for the triangular factor information */ 688 PetscCall(PetscNew(&loTriFactor)); 689 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 690 691 /* Create the matrix description */ 692 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 693 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 694 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 695 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 696 #else 697 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 698 #endif 699 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 700 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 701 702 /* 
set the operation */ 703 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 704 705 /* set the matrix */ 706 loTriFactor->csrMat = new CsrMatrix; 707 loTriFactor->csrMat->num_rows = A->rmap->n; 708 loTriFactor->csrMat->num_cols = A->cmap->n; 709 loTriFactor->csrMat->num_entries = a->nz; 710 711 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 712 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 713 714 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 715 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 716 717 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 718 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 719 720 /* Create the solve analysis information */ 721 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 722 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 723 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 724 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 725 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 726 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 727 #endif 728 729 /* perform the solve analysis */ 730 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 731 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), 732 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 733 loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 734 #else 735 loTriFactor->solveInfo)); 736 #endif 737 
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

      /* assign the pointer */
      ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

      /* log host-to-device traffic: row offsets + column indices (32-bit ints) and values (scalars), for both factors */
      PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
      PetscCallCUDA(cudaFreeHost(AiUp));
      PetscCallCUDA(cudaFreeHost(AjUp));
    } else {
      /* Factor structures already live on the GPU: only the numerical values need refreshing.
         Fill the upper triangular matrix */
      offset = 0;
      for (i = 0; i < n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] is presumably the diagonal entry of row i
           (the strict off-diagonal part has nz entries) — stored inverted for the solves */
        AAUp[offset] = 1.0 / v[nz];
        AALo[offset] = 1.0 / v[nz];

        offset += 1;
        if (nz > 0) {
          /* copy the off-diagonal row and negate it; AALo additionally divides by the diagonal */
          PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
          for (j = offset; j < offset + nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j] / v[nz];
          }
          offset += nz;
        }
      }
      PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      /* push only the new values; sparsity pattern on the GPU is unchanged */
      upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
      PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
    }
    PetscCallCUDA(cudaFreeHost(AAUp));
    PetscCallCUDA(cudaFreeHost(AALo));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  }
  PetscFunctionReturn(0);
}

/* Ensures the ICC triangular factors of A are present on the GPU and, when the factorization
   ordering is not the identity, mirrors the row/column permutations to device arrays used by
   MatSolve_SeqAIJCUSPARSE. Also sizes the work vector and records the factor nnz count. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row; /* row permutation from the factorization */
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 794 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 795 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 796 797 A->offloadmask = PETSC_OFFLOAD_BOTH; 798 799 /* lower triangular indices */ 800 PetscCall(ISIdentity(ip, &perm_identity)); 801 if (!perm_identity) { 802 IS iip; 803 const PetscInt *irip, *rip; 804 805 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 806 PetscCall(ISGetIndices(iip, &irip)); 807 PetscCall(ISGetIndices(ip, &rip)); 808 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 809 cusparseTriFactors->rpermIndices->assign(rip, rip + n); 810 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 811 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 812 PetscCall(ISRestoreIndices(iip, &irip)); 813 PetscCall(ISDestroy(&iip)); 814 PetscCall(ISRestoreIndices(ip, &rip)); 815 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 816 } 817 PetscFunctionReturn(0); 818 } 819 820 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 821 { 822 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 823 IS ip = b->row; 824 PetscBool perm_identity; 825 826 PetscFunctionBegin; 827 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 828 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 829 B->offloadmask = PETSC_OFFLOAD_CPU; 830 /* determine which version of MatSolve needs to be used. 
*/ 831 PetscCall(ISIdentity(ip, &perm_identity)); 832 if (perm_identity) { 833 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 834 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 835 B->ops->matsolve = NULL; 836 B->ops->matsolvetranspose = NULL; 837 } else { 838 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 839 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 840 B->ops->matsolve = NULL; 841 B->ops->matsolvetranspose = NULL; 842 } 843 844 /* get the triangular factors */ 845 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 846 PetscFunctionReturn(0); 847 } 848 849 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 850 { 851 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 852 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 853 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 854 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 855 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 856 cusparseIndexBase_t indexBase; 857 cusparseMatrixType_t matrixType; 858 cusparseFillMode_t fillMode; 859 cusparseDiagType_t diagType; 860 861 PetscFunctionBegin; 862 /* allocate space for the transpose of the lower triangular factor */ 863 PetscCall(PetscNew(&loTriFactorT)); 864 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 865 866 /* set the matrix descriptors of the lower triangular factor */ 867 matrixType = cusparseGetMatType(loTriFactor->descr); 868 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 869 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 870 diagType = cusparseGetMatDiagType(loTriFactor->descr); 871 872 /* Create the matrix description */ 873 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 874 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 875 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 876 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 877 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 878 879 /* set the operation */ 880 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 881 882 /* allocate GPU space for the CSC of the lower triangular factor*/ 883 loTriFactorT->csrMat = new CsrMatrix; 884 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 885 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 886 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 887 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 888 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 889 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 890 891 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 892 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 893 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 894 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 895 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 896 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 897 #endif 898 899 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 900 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 901 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 902 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 903 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 904 #else 905 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 906 #endif 907 PetscCallCUDA(WaitForCUDA()); 908 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 909 910 /* Create the solve analysis information */ 911 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 912 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 913 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 914 
PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 915 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 916 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 917 #endif 918 919 /* perform the solve analysis */ 920 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 921 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 922 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 923 loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 924 #else 925 loTriFactorT->solveInfo)); 926 #endif 927 PetscCallCUDA(WaitForCUDA()); 928 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 929 930 /* assign the pointer */ 931 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 932 933 /*********************************************/ 934 /* Now the Transpose of the Upper Tri Factor */ 935 /*********************************************/ 936 937 /* allocate space for the transpose of the upper triangular factor */ 938 PetscCall(PetscNew(&upTriFactorT)); 939 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 940 941 /* set the matrix descriptors of the upper triangular factor */ 942 matrixType = cusparseGetMatType(upTriFactor->descr); 943 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 944 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 945 diagType = cusparseGetMatDiagType(upTriFactor->descr); 946 947 /* Create the matrix description */ 948 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 949 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 950 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 951 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 952 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 953 954 /* set the operation */ 955 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 956 957 /* allocate GPU space for the CSC of the upper triangular factor*/ 958 upTriFactorT->csrMat = new CsrMatrix; 959 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 960 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 961 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 962 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 963 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 964 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 965 966 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 967 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 968 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 969 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 970 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 971 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 972 #endif 973 974 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 975 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 976 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 977 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 978 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 979 #else 980 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 981 #endif 982 983 PetscCallCUDA(WaitForCUDA()); 984 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 985 986 /* Create the solve analysis information */ 987 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 988 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 989 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 990 
PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 991 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 992 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 993 #endif 994 995 /* perform the solve analysis */ 996 /* christ, would it have killed you to put this stuff in a function????????? */ 997 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 998 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 999 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1000 upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1001 #else 1002 upTriFactorT->solveInfo)); 1003 #endif 1004 1005 PetscCallCUDA(WaitForCUDA()); 1006 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1007 1008 /* assign the pointer */ 1009 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1010 PetscFunctionReturn(0); 1011 } 1012 1013 struct PetscScalarToPetscInt { 1014 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1015 }; 1016 1017 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1018 { 1019 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1020 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1021 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1022 cusparseStatus_t stat; 1023 cusparseIndexBase_t indexBase; 1024 1025 PetscFunctionBegin; 1026 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1027 matstruct = 
(Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1028 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1029 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1030 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1031 if (A->transupdated) PetscFunctionReturn(0); 1032 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1033 PetscCall(PetscLogGpuTimeBegin()); 1034 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1035 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1036 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1037 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1038 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1039 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1040 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1041 1042 /* set alpha and beta */ 1043 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 1044 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 1045 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1046 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1047 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1048 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1049 1050 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1051 CsrMatrix *matrixT = new CsrMatrix; 1052 matstructT->mat = matrixT; 1053 matrixT->num_rows = A->cmap->n; 1054 matrixT->num_cols = A->rmap->n; 1055 matrixT->num_entries = a->nz; 1056 matrixT->row_offsets = new 
THRUSTINTARRAY32(matrixT->num_rows + 1); 1057 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1058 matrixT->values = new THRUSTARRAY(a->nz); 1059 1060 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1061 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1062 1063 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1064 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1065 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1066 indexBase, cusparse_scalartype); 1067 PetscCallCUSPARSE(stat); 1068 #else 1069 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1070 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1071 1072 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1073 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1074 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1075 */ 1076 if (matrixT->num_entries) { 1077 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1078 PetscCallCUSPARSE(stat); 1079 1080 } else { 1081 matstructT->matDescr = NULL; 1082 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1083 } 1084 #endif 1085 #endif 1086 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1087 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1088 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1089 #else 1090 CsrMatrix *temp = new CsrMatrix; 1091 CsrMatrix *tempT = new CsrMatrix; 1092 /* First convert HYB to CSR */ 1093 temp->num_rows = A->rmap->n; 1094 temp->num_cols = A->cmap->n; 1095 temp->num_entries = a->nz; 1096 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1097 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1098 temp->values = new THRUSTARRAY(a->nz); 1099 1100 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1101 PetscCallCUSPARSE(stat); 1102 1103 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1104 tempT->num_rows = A->rmap->n; 1105 tempT->num_cols = A->cmap->n; 1106 tempT->num_entries = a->nz; 1107 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1108 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1109 tempT->values = new THRUSTARRAY(a->nz); 1110 1111 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1112 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1113 PetscCallCUSPARSE(stat); 1114 1115 /* Last, convert CSC to HYB */ 1116 cusparseHybMat_t hybMat; 1117 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1118 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1119 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1120 PetscCallCUSPARSE(stat); 1121 1122 /* assign the pointer */ 1123 matstructT->mat = hybMat; 1124 A->transupdated = PETSC_TRUE; 1125 /* delete temporaries */ 1126 if (tempT) { 1127 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1128 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1129 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1130 delete (CsrMatrix *)tempT; 1131 } 1132 if (temp) { 1133 if (temp->values) delete (THRUSTARRAY *)temp->values; 1134 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1135 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1136 delete (CsrMatrix *)temp; 1137 } 1138 #endif 1139 } 1140 } 1141 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, 
update data */ 1142 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1143 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1144 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1145 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1146 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1147 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1148 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1149 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1150 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1151 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1152 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1153 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1154 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1155 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1156 } 1157 if (!cusparsestruct->csr2csc_i) { 1158 THRUSTARRAY csr2csc_a(matrix->num_entries); 1159 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1160 1161 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1162 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1163 void *csr2cscBuffer; 1164 size_t csr2cscBufferSize; 1165 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1166 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, 
&csr2cscBufferSize); 1167 PetscCallCUSPARSE(stat); 1168 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1169 #endif 1170 1171 if (matrix->num_entries) { 1172 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1173 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1174 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1175 1176 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1177 should be filled with indexBase. So I just take a shortcut here. 1178 */ 1179 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1180 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1181 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1182 PetscCallCUSPARSE(stat); 1183 #else 1184 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1185 PetscCallCUSPARSE(stat); 1186 #endif 1187 } else { 1188 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1189 } 1190 1191 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1192 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1193 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1194 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1195 #endif 1196 } 1197 PetscCallThrust( 1198 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1199 } 1200 PetscCall(PetscLogGpuTimeEnd()); 1201 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1202 /* the compressed row indices is not used for matTranspose */ 1203 matstructT->cprowIndices = NULL; 1204 /* assign the pointer */ 1205 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1206 A->transupdated = PETSC_TRUE; 1207 PetscFunctionReturn(0); 1208 } 1209 1210 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1211 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1212 { 1213 PetscInt n = xx->map->n; 1214 const PetscScalar *barray; 1215 PetscScalar *xarray; 1216 thrust::device_ptr<const PetscScalar> bGPU; 1217 thrust::device_ptr<PetscScalar> xGPU; 1218 cusparseStatus_t stat; 1219 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1220 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1221 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1222 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1223 1224 PetscFunctionBegin; 1225 /* Analyze the matrix and create the transpose ... 
on the fly */ 1226 if (!loTriFactorT && !upTriFactorT) { 1227 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1228 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1229 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1230 } 1231 1232 /* Get the GPU pointers */ 1233 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1234 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1235 xGPU = thrust::device_pointer_cast(xarray); 1236 bGPU = thrust::device_pointer_cast(barray); 1237 1238 PetscCall(PetscLogGpuTimeBegin()); 1239 /* First, reorder with the row permutation */ 1240 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1241 1242 /* First, solve U */ 1243 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, 1244 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1245 upTriFactorT->csrMat->num_entries, 1246 #endif 1247 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, 1248 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1249 tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer); 1250 PetscCallCUSPARSE(stat); 1251 #else 1252 tempGPU->data().get()); 1253 PetscCallCUSPARSE(stat); 1254 #endif 1255 1256 /* Then, solve L */ 1257 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, 1258 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1259 loTriFactorT->csrMat->num_entries, 1260 #endif 1261 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 
loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1262 tempGPU->data().get(), 1263 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1264 xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer); 1265 PetscCallCUSPARSE(stat); 1266 #else 1267 xarray); 1268 PetscCallCUSPARSE(stat); 1269 #endif 1270 1271 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1272 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1273 1274 /* Copy the temporary to the full solution. */ 1275 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1276 1277 /* restore */ 1278 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1279 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1280 PetscCall(PetscLogGpuTimeEnd()); 1281 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1282 PetscFunctionReturn(0); 1283 } 1284 1285 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1286 { 1287 const PetscScalar *barray; 1288 PetscScalar *xarray; 1289 cusparseStatus_t stat; 1290 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1291 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1292 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1293 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1294 1295 PetscFunctionBegin; 1296 /* Analyze the matrix and create the transpose ... 
on the fly */ 1297 if (!loTriFactorT && !upTriFactorT) { 1298 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1299 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1300 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1301 } 1302 1303 /* Get the GPU pointers */ 1304 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1305 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1306 1307 PetscCall(PetscLogGpuTimeBegin()); 1308 /* First, solve U */ 1309 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, 1310 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1311 upTriFactorT->csrMat->num_entries, 1312 #endif 1313 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, 1314 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1315 tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer); 1316 PetscCallCUSPARSE(stat); 1317 #else 1318 tempGPU->data().get()); 1319 PetscCallCUSPARSE(stat); 1320 #endif 1321 1322 /* Then, solve L */ 1323 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, 1324 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1325 loTriFactorT->csrMat->num_entries, 1326 #endif 1327 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1328 tempGPU->data().get(), 1329 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1330 xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer); 1331 PetscCallCUSPARSE(stat); 1332 #else 1333 xarray); 1334 PetscCallCUSPARSE(stat); 1335 #endif 1336 1337 /* restore */ 1338 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1339 
PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1340 PetscCall(PetscLogGpuTimeEnd()); 1341 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1342 PetscFunctionReturn(0); 1343 } 1344 1345 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1346 { 1347 const PetscScalar *barray; 1348 PetscScalar *xarray; 1349 thrust::device_ptr<const PetscScalar> bGPU; 1350 thrust::device_ptr<PetscScalar> xGPU; 1351 cusparseStatus_t stat; 1352 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1353 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1354 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1355 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1356 1357 PetscFunctionBegin; 1358 1359 /* Get the GPU pointers */ 1360 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1361 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1362 xGPU = thrust::device_pointer_cast(xarray); 1363 bGPU = thrust::device_pointer_cast(barray); 1364 1365 PetscCall(PetscLogGpuTimeBegin()); 1366 /* First, reorder with the row permutation */ 1367 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1368 1369 /* Next, solve L */ 1370 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 1371 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1372 loTriFactor->csrMat->num_entries, 1373 #endif 1374 &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 1375 tempGPU->data().get(), 1376 #if 
PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1377 xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer); 1378 PetscCallCUSPARSE(stat); 1379 #else 1380 xarray); 1381 PetscCallCUSPARSE(stat); 1382 #endif 1383 1384 /* Then, solve U */ 1385 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 1386 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1387 upTriFactor->csrMat->num_entries, 1388 #endif 1389 &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, 1390 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1391 tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer); 1392 PetscCallCUSPARSE(stat); 1393 #else 1394 tempGPU->data().get()); 1395 PetscCallCUSPARSE(stat); 1396 #endif 1397 1398 /* Last, reorder with the column permutation */ 1399 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1400 1401 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1402 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1403 PetscCall(PetscLogGpuTimeEnd()); 1404 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1405 PetscFunctionReturn(0); 1406 } 1407 1408 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1409 { 1410 const PetscScalar *barray; 1411 PetscScalar *xarray; 1412 cusparseStatus_t stat; 1413 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1414 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1415 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct 
*)cusparseTriFactors->upTriFactorPtr; 1416 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1417 1418 PetscFunctionBegin; 1419 /* Get the GPU pointers */ 1420 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1421 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1422 1423 PetscCall(PetscLogGpuTimeBegin()); 1424 /* First, solve L */ 1425 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 1426 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1427 loTriFactor->csrMat->num_entries, 1428 #endif 1429 &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, 1430 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1431 tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer); 1432 PetscCallCUSPARSE(stat); 1433 #else 1434 tempGPU->data().get()); 1435 PetscCallCUSPARSE(stat); 1436 #endif 1437 1438 /* Next, solve U */ 1439 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 1440 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1441 upTriFactor->csrMat->num_entries, 1442 #endif 1443 &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 1444 tempGPU->data().get(), 1445 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1446 xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer); 1447 PetscCallCUSPARSE(stat); 1448 #else 1449 xarray); 1450 PetscCallCUSPARSE(stat); 1451 #endif 1452 1453 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1454 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1455 PetscCall(PetscLogGpuTimeEnd()); 1456 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1457 PetscFunctionReturn(0); 1458 } 1459 1460 #if CUSPARSE_VERSION >= 
11500 1461 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1462 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) 1463 { 1464 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1465 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1466 const PetscScalar *barray; 1467 PetscScalar *xarray; 1468 1469 PetscFunctionBegin; 1470 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1471 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1472 PetscCall(PetscLogGpuTimeBegin()); 1473 1474 /* Solve L*y = b */ 1475 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1476 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1477 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1478 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, 1479 fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()! 
1480 1481 /* Solve U*x = y */ 1482 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1483 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */ 1484 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U)); 1485 1486 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1487 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1488 1489 PetscCall(PetscLogGpuTimeEnd()); 1490 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1491 PetscFunctionReturn(0); 1492 } 1493 1494 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) 1495 { 1496 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1497 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1498 const PetscScalar *barray; 1499 PetscScalar *xarray; 1500 1501 PetscFunctionBegin; 1502 if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */ 1503 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1504 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do tranpose solve with it */ 1505 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1506 1507 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1508 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1509 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1510 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1511 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1512 } 1513 1514 if (!fs->updatedTransposeSpSVAnalysis) { 1515 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1516 1517 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1518 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1519 } 1520 1521 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1522 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1523 PetscCall(PetscLogGpuTimeBegin()); 1524 1525 /* Solve Ut*y = b */ 1526 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1527 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1528 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */ 1529 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut)); 1530 1531 /* Solve Lt*x = y */ 1532 
PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1533 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1534 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1535 1536 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1537 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1538 PetscCall(PetscLogGpuTimeEnd()); 1539 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1540 PetscFunctionReturn(0); 1541 } 1542 1543 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) 1544 { 1545 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1546 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1547 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1548 CsrMatrix *Acsr; 1549 PetscInt m, nz; 1550 PetscBool flg; 1551 1552 PetscFunctionBegin; 1553 if (PetscDefined(USE_DEBUG)) { 1554 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1555 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1556 } 1557 1558 /* Copy A's value to fact */ 1559 m = fact->rmap->n; 1560 nz = aij->nz; 1561 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1562 Acsr = (CsrMatrix *)Acusp->mat->mat; 1563 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1564 1565 /* Factorize fact inplace */ 1566 if (m) 1567 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1568 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1569 if (PetscDefined(USE_DEBUG)) { 1570 int numerical_zero; 1571 cusparseStatus_t status; 1572 status = 
cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1573 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1574 } 1575 1576 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1577 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1578 */ 1579 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1580 1581 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1582 1583 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1584 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1585 1586 fact->offloadmask = PETSC_OFFLOAD_GPU; 1587 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0; 1588 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0; 1589 fact->ops->matsolve = NULL; 1590 fact->ops->matsolvetranspose = NULL; 1591 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1592 PetscFunctionReturn(0); 1593 } 1594 1595 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1596 { 1597 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1598 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1599 PetscInt m, nz; 1600 1601 PetscFunctionBegin; 1602 if (PetscDefined(USE_DEBUG)) { 1603 PetscInt i; 1604 PetscBool flg, missing; 1605 1606 PetscCall(PetscObjectTypeCompare((PetscObject)A, 
MATSEQAIJCUSPARSE, &flg)); 1607 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1608 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1609 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1610 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1611 } 1612 1613 /* Free the old stale stuff */ 1614 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1615 1616 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1617 but they will not be used. Allocate them just for easy debugging. 1618 */ 1619 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1620 1621 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1622 fact->factortype = MAT_FACTOR_ILU; 1623 fact->info.factor_mallocs = 0; 1624 fact->info.fill_ratio_given = info->fill; 1625 fact->info.fill_ratio_needed = 1.0; 1626 1627 aij->row = NULL; 1628 aij->col = NULL; 1629 1630 /* ====================================================================== */ 1631 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 1632 /* We'll do in-place factorization on fact */ 1633 /* ====================================================================== */ 1634 const int *Ai, *Aj; 1635 1636 m = fact->rmap->n; 1637 nz = aij->nz; 1638 1639 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1640 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1641 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1642 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1643 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1644 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1645 1646 /* ====================================================================== */ 1647 /* Create descriptors for M, L, U */ 1648 /* ====================================================================== */ 1649 cusparseFillMode_t fillMode; 1650 cusparseDiagType_t diagType; 1651 1652 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1653 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1654 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1655 1656 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1657 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1658 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1659 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1660 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1661 */ 1662 fillMode = CUSPARSE_FILL_MODE_LOWER; 1663 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1664 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1665 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1666 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1667 1668 fillMode = CUSPARSE_FILL_MODE_UPPER; 1669 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1670 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1671 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1672 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1673 1674 /* ========================================================================= */ 1675 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1676 /* ========================================================================= */ 1677 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1678 if (m) 1679 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1680 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M)); 1681 1682 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1683 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1684 1685 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1686 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1687 1688 
PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1689 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1690 1691 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1692 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1693 1694 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1695 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1696 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1697 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1698 */ 1699 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1700 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1701 fs->spsvBuffer_L = fs->factBuffer_M; 1702 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1703 } else { 1704 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1705 fs->spsvBuffer_U = fs->factBuffer_M; 1706 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1707 } 1708 1709 /* ========================================================================== */ 1710 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1711 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1712 /* ========================================================================== */ 1713 int structural_zero; 1714 cusparseStatus_t status; 1715 1716 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1717 if (m) 1718 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1719 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1720 if (PetscDefined(USE_DEBUG)) { 1721 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1722 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1723 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1724 } 1725 1726 /* Estimate FLOPs of the numeric factorization */ 1727 { 1728 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1729 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1730 PetscLogDouble flops = 0.0; 1731 1732 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1733 Ai = Aseq->i; 1734 Adiag = Aseq->diag; 1735 for (PetscInt i = 0; i < m; i++) { 1736 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1737 nzRow = Ai[i + 1] - Ai[i]; 1738 nzLeft = Adiag[i] - Ai[i]; 1739 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1740 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1741 */ 1742 nzLeft = (nzRow - 1) / 2; 1743 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1744 } 1745 } 1746 fs->numericFactFlops = flops; 1747 } 1748 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1749 PetscFunctionReturn(0); 1750 } 1751 1752 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1753 { 1754 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1755 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1756 const PetscScalar *barray; 1757 PetscScalar *xarray; 1758 1759 PetscFunctionBegin; 1760 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1761 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1762 PetscCall(PetscLogGpuTimeBegin()); 1763 1764 /* Solve L*y = b */ 1765 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1766 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1767 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, 
fs->spMatDescr_L, /* L Y = X */ 1768 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1769 1770 /* Solve Lt*x = y */ 1771 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1772 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1773 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1774 1775 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1776 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1777 1778 PetscCall(PetscLogGpuTimeEnd()); 1779 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1780 PetscFunctionReturn(0); 1781 } 1782 1783 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) 1784 { 1785 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1786 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1787 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1788 CsrMatrix *Acsr; 1789 PetscInt m, nz; 1790 PetscBool flg; 1791 1792 PetscFunctionBegin; 1793 if (PetscDefined(USE_DEBUG)) { 1794 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1795 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1796 } 1797 1798 /* Copy A's value to fact */ 1799 m = fact->rmap->n; 1800 nz = aij->nz; 1801 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1802 Acsr = (CsrMatrix *)Acusp->mat->mat; 1803 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1804 1805 /* Factorize fact inplace */ 1806 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1807 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1808 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1809 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1810 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1811 */ 1812 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1813 if (PetscDefined(USE_DEBUG)) { 1814 int numerical_zero; 1815 cusparseStatus_t status; 1816 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1817 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1818 } 1819 1820 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1821 1822 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1823 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1824 */ 1825 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1826 1827 fact->offloadmask = PETSC_OFFLOAD_GPU; 1828 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1829 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1830 fact->ops->matsolve = NULL; 1831 fact->ops->matsolvetranspose = NULL; 1832 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1833 PetscFunctionReturn(0); 1834 } 1835 1836 
/* MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - device (cuSPARSE) symbolic IC(0) factorization

   IC(0) keeps A's sparsity pattern, so this routine copies A's CSR structure to `fact` on the
   device, creates the cuSPARSE matrix/vector descriptors, sizes and allocates the csric02 and
   SpSV work buffers, runs the csric02 analysis, and estimates the flops of the numeric phase.

   Input Parameters:
.  fact - the factor matrix being set up
.  A    - the matrix to factor; must be square MATSEQAIJCUSPARSE with a full diagonal (checked in debug builds)
.  perm - row/column permutation (identity is required by the caller, MatICCFactorSymbolic_SeqAIJCUSPARSE())
.  info - factorization options; only info->fill is recorded
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* M is the legacy (csric02) descriptor; L the generic-API (SpSV) one     */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares the device arrays of the full pattern; only the lower triangle is addressed via the fill-mode attribute */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* Work vectors used as the dense operands of the SpSV descriptors */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M; /* alias the larger solver buffer onto the factorization buffer */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* assumes roughly half of a row's off-diagonals are to the left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* consumed by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() for logging */
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
#endif

/* MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU; dispatches to the device ILU(0) fast
   path when available (no levels of fill, identity orderings, factorization bound to device),
   otherwise falls back to the host SeqAIJ symbolic routine with device numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* NOTE(review): guard uses CUSPARSE_VERSION >= 11500 (cuSPARSE 11.5); confirm this matches the
     release that introduced the cusparseSpSV API required by the ILU0 fast path */
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    /* Fast path: in-place device ILU(0) */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU: host symbolic phase, device numeric phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); /* drop any stale device factors */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC; dispatches to the device IC(0) fast path
   when available (no levels of fill, identity permutation, factorization bound to device),
   otherwise falls back to the host SeqAIJ symbolic routine with device numeric factorization. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* NOTE(review): same CUSPARSE_VERSION >= 11500 gate as the ILU path — verify threshold */
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    /* Fast path: in-place device IC(0) */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky: host symbolic phase, device numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); /* drop any stale device factors */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* MatGetFactor_seqaijcusparse_cusparse - create a factor matrix of type MATSEQAIJCUSPARSE for A,
   honoring -mat_factor_bind_factorization (host|device) and installing the symbolic callbacks
   appropriate for the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Prefer the factor's own options prefix, falling back to A's */
  prefix = (*B)->factorprefix ?
(*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  /* Validate the option value: only "host" or "device" (case-insensitive) are accepted */
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* matrix is pinned to the CPU: use the plain SeqAIJ host paths */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* MatSeqAIJCUSPARSECopyFromGPU - copy the (possibly factored) matrix values from device to
   the host CSR arrays when the up-to-date copy lives only on the GPU; afterwards both copies
   are valid (PETSC_OFFLOAD_BOTH). Only values are copied; the sparsity pattern is unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* NOTE(review): "CUSPARSE_VERSION >= 13500" is an unusually high threshold (cuSPARSE 13.5);
     verify it is intended and not a typo for a lower version */
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2148 { 2149 PetscFunctionBegin; 2150 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2151 *array = ((Mat_SeqAIJ *)A->data)->a; 2152 PetscFunctionReturn(0); 2153 } 2154 2155 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2156 { 2157 PetscFunctionBegin; 2158 A->offloadmask = PETSC_OFFLOAD_CPU; 2159 *array = NULL; 2160 PetscFunctionReturn(0); 2161 } 2162 2163 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2164 { 2165 PetscFunctionBegin; 2166 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2167 *array = ((Mat_SeqAIJ *)A->data)->a; 2168 PetscFunctionReturn(0); 2169 } 2170 2171 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2172 { 2173 PetscFunctionBegin; 2174 *array = NULL; 2175 PetscFunctionReturn(0); 2176 } 2177 2178 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2179 { 2180 PetscFunctionBegin; 2181 *array = ((Mat_SeqAIJ *)A->data)->a; 2182 PetscFunctionReturn(0); 2183 } 2184 2185 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2186 { 2187 PetscFunctionBegin; 2188 A->offloadmask = PETSC_OFFLOAD_CPU; 2189 *array = NULL; 2190 PetscFunctionReturn(0); 2191 } 2192 2193 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2194 { 2195 Mat_SeqAIJCUSPARSE *cusp; 2196 CsrMatrix *matrix; 2197 2198 PetscFunctionBegin; 2199 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2200 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 2201 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2202 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 2203 matrix = (CsrMatrix 
*)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* MatSeqAIJCUSPARSECopyToGPU - mirror the host CSR matrix on the device.

   If the nonzero pattern is unchanged (same nonzerostate) and the format is CSR, only the
   values are re-uploaded. Otherwise the whole device structure (CSR or, pre CUDA-11,
   ELL/HYB) is rebuilt, including the scalar constants used by SpMV and the compressed-row
   index array. No-op if the device copy is already current or the matrix is bound to CPU
   (the latter is an error).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when host values are absent, so offloadmask stays unchanged */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Pattern changed (or non-CSR format): destroy and rebuild the device structure */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: build structure only, and do not mark host+device as both valid */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Pre CUDA-11 path: build a temporary CSR then convert it to a cuSPARSE HYB matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR arrays are no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SSELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: element-wise accumulate, get<1> += get<0> */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: element-wise copy, get<1> = get<0> */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

struct
VecCUDAEqualsReverse {
  /* Thrust functor: element-wise copy in the opposite direction, get<0> = get<1> */
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product workspace attached to Mat_Product for cuSPARSE sparse-dense and sparse-sparse products */
struct MatMatCusparse {
  PetscBool      cisdense; /* is the product matrix C dense? */
  PetscScalar   *Bt;       /* device buffer holding an explicit transpose of B (legacy csrmm path) */
  Mat            X;        /* intermediate dense product for PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* MatDestroy_MatMatCusparse - destructor for the MatMatCusparse workspace: releases all device
   buffers, cuSPARSE descriptors, and the intermediate matrix X. cudaFree(NULL) is a no-op, so
   unconditionally freeing Bt is safe. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);

/* MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of sparse(A) * dense(B)
   products (AB, AtB, ABt, PtAP, RARt) via cusparseSpMM (CUDA >= 11) or csrmm (older).
   For AtB, either A's explicit transpose is used or the transpose op is passed to cuSPARSE,
   depending on A->form_explicit_transpose. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat =
cusp->matTranspose; 2492 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2493 } 2494 m = A->cmap->n; 2495 n = B->cmap->n; 2496 break; 2497 case MATPRODUCT_ABt: 2498 case MATPRODUCT_RARt: 2499 mat = cusp->mat; 2500 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2501 m = A->rmap->n; 2502 n = B->rmap->n; 2503 break; 2504 default: 2505 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2506 } 2507 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2508 csrmat = (CsrMatrix *)mat->mat; 2509 /* if the user passed a CPU matrix, copy the data to the GPU */ 2510 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2511 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2512 PetscCall(MatDenseCUDAGetArrayRead(B, &barray)); 2513 2514 PetscCall(MatDenseGetLDA(B, &blda)); 2515 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2516 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray)); 2517 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2518 } else { 2519 PetscCall(MatDenseCUDAGetArrayWrite(C, &carray)); 2520 PetscCall(MatDenseGetLDA(C, &clda)); 2521 } 2522 2523 PetscCall(PetscLogGpuTimeBegin()); 2524 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2525 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2526 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2527 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2528 size_t mmBufferSize; 2529 if (mmdata->initialized && mmdata->Blda != blda) { 2530 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2531 mmdata->matBDescr = NULL; 2532 } 2533 if (!mmdata->matBDescr) { 2534 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2535 mmdata->Blda = blda; 2536 } 2537 2538 if (mmdata->initialized && mmdata->Clda != clda) { 2539 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2540 mmdata->matCDescr = NULL; 2541 } 2542 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2543 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2544 mmdata->Clda = clda; 2545 } 2546 2547 if (!mat->matDescr) { 2548 stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2549 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2550 PetscCallCUSPARSE(stat); 2551 } 2552 stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 2553 PetscCallCUSPARSE(stat); 2554 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2555 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2556 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2557 mmdata->mmBufferSize = mmBufferSize; 2558 } 2559 mmdata->initialized = PETSC_TRUE; 2560 } else { 2561 /* to be safe, 
always update pointers of the mats */ 2562 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 2563 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2564 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2565 } 2566 2567 /* do cusparseSpMM, which supports transpose on B */ 2568 stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 2569 PetscCallCUSPARSE(stat); 2570 #else 2571 PetscInt k; 2572 /* cusparseXcsrmm does not support transpose on B */ 2573 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2574 cublasHandle_t cublasv2handle; 2575 cublasStatus_t cerr; 2576 2577 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2578 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2579 PetscCallCUBLAS(cerr); 2580 blda = B->cmap->n; 2581 k = B->cmap->n; 2582 } else { 2583 k = B->rmap->n; 2584 } 2585 2586 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2587 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 2588 PetscCallCUSPARSE(stat); 2589 #endif 2590 PetscCall(PetscLogGpuTimeEnd()); 2591 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2592 PetscCall(MatDenseCUDARestoreArrayRead(B, &barray)); 2593 if (product->type == MATPRODUCT_RARt) { 2594 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray)); 2595 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2596 } else if (product->type == MATPRODUCT_PtAP) { 2597 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray)); 2598 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2599 } else { 2600 PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray)); 2601 } 2602 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2603 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2604 PetscFunctionReturn(0); 2605 } 2606 2607 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2608 { 2609 Mat_Product *product = C->product; 2610 Mat A, B; 2611 PetscInt m, n; 2612 PetscBool cisdense, flg; 2613 MatMatCusparse *mmdata; 2614 Mat_SeqAIJCUSPARSE *cusp; 2615 2616 PetscFunctionBegin; 2617 MatCheckProduct(C, 1); 2618 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2619 A = product->A; 2620 B = product->B; 2621 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2622 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2623 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2624 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2625 switch (product->type) { 2626 case MATPRODUCT_AB: 2627 m = A->rmap->n; 2628 n = B->cmap->n; 2629 break; 2630 case MATPRODUCT_AtB: 2631 m = 
A->cmap->n; 2632 n = B->cmap->n; 2633 break; 2634 case MATPRODUCT_ABt: 2635 m = A->rmap->n; 2636 n = B->rmap->n; 2637 break; 2638 case MATPRODUCT_PtAP: 2639 m = B->cmap->n; 2640 n = B->cmap->n; 2641 break; 2642 case MATPRODUCT_RARt: 2643 m = B->rmap->n; 2644 n = B->rmap->n; 2645 break; 2646 default: 2647 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2648 } 2649 PetscCall(MatSetSizes(C, m, n, m, n)); 2650 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2651 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2652 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2653 2654 /* product data */ 2655 PetscCall(PetscNew(&mmdata)); 2656 mmdata->cisdense = cisdense; 2657 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2658 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2659 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2660 #endif 2661 /* for these products we need intermediate storage */ 2662 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2663 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2664 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2665 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2666 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2667 } else { 2668 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2669 } 2670 } 2671 C->product->data = mmdata; 2672 C->product->destroy = MatDestroy_MatMatCusparse; 2673 2674 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2675 PetscFunctionReturn(0); 2676 } 2677 2678 static 
/* Numeric phase for sparse-sparse products C = op(A)*op(B) with A, B, C all of type
   MATSEQAIJCUSPARSE, using cuSPARSE SpGEMM (the `static` specifier is on the preceding line).
   If the symbolic phase already computed the values (mmdata->reusesym), or C has no
   nonzeros, the GPU compute is skipped and only the assembly bookkeeping at `finalize`
   runs. Supports AB, AtB (via A's explicit transpose) and ABt (via B's explicit
   transpose), with the symmetric-matrix shortcuts agreed upon with the symbolic phase. */
PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                 *product = C->product;
  Mat                          A, B;
  Mat_SeqAIJCUSPARSE          *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                  *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                   *Acsr, *Bcsr, *Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse              *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* apply the same symmetric-matrix rewrites the symbolic phase used, and verify it did */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicit-transpose structs, since opA/opB stay NON_TRANSPOSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* reuse path: the symbolic phase already ran workEstimation/nnz/copy, so only compute here */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  /* legacy SpGEMM path: recompute with the buffers saved by the symbolic phase, then copy out */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* pre-CUDA-11 csrgemm writes straight into C's CSR arrays allocated by the symbolic phase */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse-sparse products C = op(A)*op(B), all MATSEQAIJCUSPARSE:
   determines C's nonzero pattern on the GPU with cuSPARSE SpGEMM (or SpGEMMreuse on
   CUDA >= 11.4), allocates C's CSR storage on both device and host, and installs the
   numeric callback above. Continues in the next chunk. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                 *product = C->product;
  Mat                          A, B;
  Mat_SeqAIJCUSPARSE          *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                  *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                   *Acsr, *Bcsr, *Ccsr;
  PetscInt                     i, j, m, n, k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse              *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2839 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2840 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2841 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2842 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2843 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2844 2845 ptype = product->type; 2846 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2847 ptype = MATPRODUCT_AB; 2848 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2849 } 2850 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2851 ptype = MATPRODUCT_AB; 2852 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2853 } 2854 biscompressed = PETSC_FALSE; 2855 ciscompressed = PETSC_FALSE; 2856 switch (ptype) { 2857 case MATPRODUCT_AB: 2858 m = A->rmap->n; 2859 n = B->cmap->n; 2860 k = A->cmap->n; 2861 Amat = Acusp->mat; 2862 Bmat = Bcusp->mat; 2863 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2864 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2865 break; 2866 case MATPRODUCT_AtB: 2867 m = A->cmap->n; 2868 n = B->cmap->n; 2869 k = A->rmap->n; 2870 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2871 Amat = Acusp->matTranspose; 2872 Bmat = Bcusp->mat; 2873 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2874 break; 2875 case MATPRODUCT_ABt: 2876 m = A->rmap->n; 2877 n = B->rmap->n; 2878 k = A->cmap->n; 2879 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2880 Amat = Acusp->mat; 2881 Bmat = Bcusp->matTranspose; 2882 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2883 break; 2884 default: 2885 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2886 } 2887 2888 /* create cusparse matrix */ 2889 PetscCall(MatSetSizes(C, m, n, m, n)); 2890 
PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2891 c = (Mat_SeqAIJ *)C->data; 2892 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2893 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2894 Ccsr = new CsrMatrix; 2895 2896 c->compressedrow.use = ciscompressed; 2897 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2898 c->compressedrow.nrows = a->compressedrow.nrows; 2899 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 2900 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2901 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2902 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2903 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2904 } else { 2905 c->compressedrow.nrows = 0; 2906 c->compressedrow.i = NULL; 2907 c->compressedrow.rindex = NULL; 2908 Ccusp->workVector = NULL; 2909 Cmat->cprowIndices = NULL; 2910 } 2911 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2912 Ccusp->mat = Cmat; 2913 Ccusp->mat->mat = Ccsr; 2914 Ccsr->num_rows = Ccusp->nrows; 2915 Ccsr->num_cols = n; 2916 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 2917 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2918 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2919 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2920 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 2921 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 2922 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2923 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2924 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2925 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2926 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2927 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2928 c->nz = 0; 2929 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2930 Ccsr->values = new THRUSTARRAY(c->nz); 2931 goto finalizesym; 2932 } 2933 2934 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2935 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2936 Acsr = (CsrMatrix *)Amat->mat; 2937 if (!biscompressed) { 2938 Bcsr = (CsrMatrix *)Bmat->mat; 2939 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2940 BmatSpDescr = Bmat->matDescr; 2941 #endif 2942 } else { /* we need to use row offsets for the full matrix */ 2943 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2944 Bcsr = new CsrMatrix; 2945 Bcsr->num_rows = B->rmap->n; 2946 Bcsr->num_cols = cBcsr->num_cols; 2947 Bcsr->num_entries = cBcsr->num_entries; 2948 Bcsr->column_indices = cBcsr->column_indices; 2949 Bcsr->values = cBcsr->values; 2950 if (!Bcusp->rowoffsets_gpu) { 2951 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2952 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 2953 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2954 } 2955 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2956 mmdata->Bcsr = Bcsr; 2957 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2958 if (Bcsr->num_rows && Bcsr->num_cols) { 2959 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2960 PetscCallCUSPARSE(stat); 2961 } 2962 BmatSpDescr = mmdata->matSpBDescr; 2963 #endif 2964 } 2965 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2966 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2967 /* precompute flops count */ 2968 if (ptype == MATPRODUCT_AB) { 2969 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2970 const PetscInt st = a->i[i]; 2971 const PetscInt en = a->i[i + 1]; 2972 for (j = st; j < en; j++) { 2973 const PetscInt brow = a->j[j]; 2974 flops += 2. * (b->i[brow + 1] - b->i[brow]); 2975 } 2976 } 2977 } else if (ptype == MATPRODUCT_AtB) { 2978 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2979 const PetscInt anzi = a->i[i + 1] - a->i[i]; 2980 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2981 flops += (2. * anzi) * bnzi; 2982 } 2983 } else { /* TODO */ 2984 flops = 0.; 2985 } 2986 2987 mmdata->flops = flops; 2988 PetscCall(PetscLogGpuTimeBegin()); 2989 2990 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2991 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2992 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2993 PetscCallCUSPARSE(stat); 2994 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2995 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2996 { 2997 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2998 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2999 */ 3000 void *dBuffer1 = NULL; 3001 void *dBuffer2 = NULL; 3002 void *dBuffer3 = NULL; 3003 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3004 size_t bufferSize1 = 0; 3005 size_t bufferSize2 = 0; 3006 size_t bufferSize3 = 0; 3007 size_t bufferSize4 = 0; 3008 size_t bufferSize5 = 0; 3009 3010 /*----------------------------------------------------------------------*/ 3011 /* ask bufferSize1 bytes for external memory */ 3012 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3013 PetscCallCUSPARSE(stat); 3014 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3015 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3016 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3017 PetscCallCUSPARSE(stat); 3018 3019 /*----------------------------------------------------------------------*/ 3020 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3021 PetscCallCUSPARSE(stat); 3022 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3023 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3024 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3025 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3026 PetscCallCUSPARSE(stat); 3027 
PetscCallCUDA(cudaFree(dBuffer1)); 3028 PetscCallCUDA(cudaFree(dBuffer2)); 3029 3030 /*----------------------------------------------------------------------*/ 3031 /* get matrix C non-zero entries C_nnz1 */ 3032 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3033 c->nz = (PetscInt)C_nnz1; 3034 /* allocate matrix C */ 3035 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3036 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3037 Ccsr->values = new THRUSTARRAY(c->nz); 3038 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3039 /* update matC with the new pointers */ 3040 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3041 PetscCallCUSPARSE(stat); 3042 3043 /*----------------------------------------------------------------------*/ 3044 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3045 PetscCallCUSPARSE(stat); 3046 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3047 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3048 PetscCallCUSPARSE(stat); 3049 PetscCallCUDA(cudaFree(dBuffer3)); 3050 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3051 PetscCallCUSPARSE(stat); 3052 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 
/ 1024)); 3053 } 3054 #else 3055 size_t bufSize2; 3056 /* ask bufferSize bytes for external memory */ 3057 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 3058 PetscCallCUSPARSE(stat); 3059 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3060 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3061 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3062 PetscCallCUSPARSE(stat); 3063 /* ask bufferSize again bytes for external memory */ 3064 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3065 PetscCallCUSPARSE(stat); 3066 /* The CUSPARSE documentation is not clear, nor the API 3067 We need both buffers to perform the operations properly! 3068 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3069 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3070 is stored in the descriptor! What a messy API... 
*/ 3071 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3072 /* compute the intermediate product of A * B */ 3073 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3074 PetscCallCUSPARSE(stat); 3075 /* get matrix C non-zero entries C_nnz1 */ 3076 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3077 c->nz = (PetscInt)C_nnz1; 3078 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3079 mmdata->mmBufferSize / 1024)); 3080 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3081 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3082 Ccsr->values = new THRUSTARRAY(c->nz); 3083 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3084 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3085 PetscCallCUSPARSE(stat); 3086 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3087 PetscCallCUSPARSE(stat); 3088 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3089 #else 3090 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3091 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3092 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3093 PetscCallCUSPARSE(stat); 3094 c->nz = cnz; 3095 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3096 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3097 Ccsr->values = new THRUSTARRAY(c->nz); 3098 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3099 3100 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3101 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3102 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3103 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3104 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3105 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3106 PetscCallCUSPARSE(stat); 3107 #endif 3108 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3109 PetscCall(PetscLogGpuTimeEnd()); 3110 finalizesym: 3111 c->singlemalloc = PETSC_FALSE; 3112 c->free_a = PETSC_TRUE; 3113 c->free_ij = PETSC_TRUE; 3114 PetscCall(PetscMalloc1(m + 1, &c->i)); 3115 PetscCall(PetscMalloc1(c->nz, &c->j)); 3116 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3117 PetscInt *d_i = c->i; 3118 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3119 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3120 ii = *Ccsr->row_offsets; 3121 jj = *Ccsr->column_indices; 3122 
if (ciscompressed) d_i = c->compressedrow.i; 3123 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3124 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3125 } else { 3126 PetscInt *d_i = c->i; 3127 if (ciscompressed) d_i = c->compressedrow.i; 3128 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3129 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3130 } 3131 if (ciscompressed) { /* need to expand host row offsets */ 3132 PetscInt r = 0; 3133 c->i[0] = 0; 3134 for (k = 0; k < c->compressedrow.nrows; k++) { 3135 const PetscInt next = c->compressedrow.rindex[k]; 3136 const PetscInt old = c->compressedrow.i[k]; 3137 for (; r < next; r++) c->i[r + 1] = old; 3138 } 3139 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3140 } 3141 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3142 PetscCall(PetscMalloc1(m, &c->ilen)); 3143 PetscCall(PetscMalloc1(m, &c->imax)); 3144 c->maxnz = c->nz; 3145 c->nonzerorowcnt = 0; 3146 c->rmax = 0; 3147 for (k = 0; k < m; k++) { 3148 const PetscInt nn = c->i[k + 1] - c->i[k]; 3149 c->ilen[k] = c->imax[k] = nn; 3150 c->nonzerorowcnt += (PetscInt) !!nn; 3151 c->rmax = PetscMax(c->rmax, nn); 3152 } 3153 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3154 PetscCall(PetscMalloc1(c->nz, &c->a)); 3155 Ccsr->num_entries = c->nz; 3156 3157 C->nonzerostate++; 3158 PetscCall(PetscLayoutSetUp(C->rmap)); 3159 PetscCall(PetscLayoutSetUp(C->cmap)); 3160 Ccusp->nonzerostate = C->nonzerostate; 3161 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3162 C->preallocated = PETSC_TRUE; 3163 C->assembled = PETSC_FALSE; 3164 C->was_assembled = PETSC_FALSE; 3165 if 
(product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* Handles sparse or dense B.
   Selects the symbolic-product implementation for mat based on where the operands live:
   when both B and C are CUSPARSE (and not bound to the CPU) the GPU path is chosen unless
   the user requests the CPU backend via the -*_backend_cpu options parsed below. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* a CPU-bound operand disqualifies the GPU backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the user came through the old API (MatMatMult etc.)
       or the MatProduct API; the semantics are identical */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Scatter-add kernel: y[idx[i]] += x[i] for 0 <= i < n.
   One thread per entry; callers launch ceil(n/256) blocks of 256 threads.
   Guarded against the grid tail (i >= n). Assumes idx has no duplicate
   entries within a launch (each output location written by one thread);
   the caller passes compressed-row indices, which are unique. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   yy may be NULL (plain multiply) or equal zz (in-place add). When the matrix
   stores compressed rows (zero rows dropped), a work vector holds the short
   product/operand and the result is scattered/gathered to full length. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                  *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE          *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                 *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A) x == 0, so the result is just y (or 0) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
    else PetscCall(VecCopy_SeqCUDA(yy, zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose implicitly */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use (and build on demand) an explicitly stored transpose */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors; execution errors surface at the next sync */
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero for multiply-add; without yy the first write per row is an assignment */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Runs the host-side assembly, then drops the cached device matrix if the nonzero
   structure changed (it would no longer match the new sparsity pattern). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscObjectState    onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}

/* Frees the GPU-side storage (plain or factored), removes the composed query
   functions installed by MatConvert_SeqAIJ_SeqAIJCUSPARSE(), then delegates to
   the base SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicate on the host, then convert the copy in place to CUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X, performed on the GPU when both operands are CUSPARSE/CSR; otherwise
   falls back to the host implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed types (e.g. one operand bound to CPU): do it on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nnz count and identical structure arrays */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(),
csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via cusparse csr spgeam; writes the result back into Y's arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* spgeam scalars (&a, &b) live on the host; restore device pointer mode afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the values arrays line up entry-for-entry, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different pattern: host fallback */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, scaling the device values array in place with cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zeroes the values on both host and device (including the cached transpose), keeping
   the sparsity pattern; offloadmask records whether both copies are now in sync. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switches the matrix operation table between the host (SeqAIJ) and device (CUSPARSE)
   implementations; flg == PETSC_TRUE binds to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* bring the values back to the host before handing control to the CPU ops */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* install the GPU implementations and the device array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only meaningful when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Converts a SeqAIJ matrix to SeqAIJCUSPARSE: allocates the GPU-side context
   (cuSPARSE handle, storage format, algorithm defaults), installs the CUSPARSE
   operation table, and composes the type-specific query functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor storage instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* Type constructor: build a SeqAIJ matrix, then convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs.
These matrices can be in either 3886 CSR, ELL, or Hybrid format. 3887 All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 3888 3889 Options Database Keys: 3890 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 3891 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 3892 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 3893 + -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 3894 3895 Level: beginner 3896 3897 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3898 M*/ 3899 3900 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *); 3901 3902 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3903 { 3904 PetscFunctionBegin; 3905 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band)); 3906 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 3907 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 3908 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 3909 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 3910 3911 PetscFunctionReturn(0); 3912 } 3913 3914 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat 
mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees any non-NULL jmap_d/perm_d,
       so leaving stale values here would lead to a double cudaFree() */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Free a Mat_SeqAIJCUSPARSE context: the SpMV structs for the matrix and its cached transpose,
   the thrust work arrays, the COO support arrays, and the cusparse handle. *cusparsestruct is
   PetscFree'd and must not be used afterwards. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (the thrust vectors holding values/column indices/row offsets) and NULL the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Free a triangular-factor struct: matrix descriptor, triangular-solve info, CSR storage,
   solve buffer, pinned host copy of the values, and (CUDA >= 11) the csr2csc buffer */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Free a SpMV/SpMM struct. 'format' tells whether ->mat holds a CsrMatrix or a legacy
   cusparseHybMat_t (ELL/HYB, only possible before CUDA 11). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* check the error code; it was previously dropped silently */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* release the per-operation (N/T/H) generic SpMV resources */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release everything owned by a Mat_SeqAIJCUSPARSETriFactors context (factors, permutations,
   work vector, band arrays, and the cusparse descriptors of the SpSV-based path); the cusparse
   handle itself is kept so the context can be refilled */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}

/* Fully destroy a trifactors context: Reset() plus the cusparse handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle = (*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy(handle)); /* intentional assignment inside the condition */
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}

/* Lexicographic less-than on (row, col) tuples; used to sort COO entries by row, then column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality on (row, col) tuples; used to collapse duplicate COO entries */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* Returns 1 when two adjacent indices differ, 0 when equal; feeds adjacent_difference below */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ?
0 : 1; }
};

/* Logical OR; merges the row-change and column-change flags into "new nonzero" markers */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(): scatter (and possibly reduce)
   the user-provided COO values v[] into the CSR value array using the cooPerm permutation
   computed at preallocation time; cooPerm_a (when present) maps sorted entries to their
   unique-nonzero index so duplicates can be summed with reduce_by_key */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation on record: just run the assembly cycle */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v[] is on the host: stage it into a device array first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up before updating the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose as out of date; when 'destroy' is true also free it
   (and the csr2csc index map) instead of merely flagging it */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0; /* number of COO entries from a previous preallocation, if any */
  if (n != cooPerm_n) {
    /* size changed: drop the stale permutation arrays, they are rebuilt below */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* make sure the row indices live on the device (copy from host if needed) */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays of the Mat_SeqAIJ from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation driver: take the fast 'Basic' path when the indices are on the device or
   contain no negative entries; otherwise fall back to the host implementation and mirror its
   jmap/perm support arrays on the GPU ("extended COO" mode) */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (ignored entries) are only handled by the host fallback */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* mirror the host-side COO maps on the device for MatSetValuesCOO */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
PetscFunctionReturn(0); 4379 } 4380 4381 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4382 { 4383 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4384 const PetscCount grid_size = gridDim.x * blockDim.x; 4385 for (; i < nnz; i += grid_size) { 4386 PetscScalar sum = 0.0; 4387 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4388 a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4389 } 4390 } 4391 4392 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4393 { 4394 Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4395 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4396 PetscCount Annz = seq->nz; 4397 PetscMemType memtype; 4398 const PetscScalar *v1 = v; 4399 PetscScalar *Aa; 4400 4401 PetscFunctionBegin; 4402 if (dev->use_extended_coo) { 4403 PetscCall(PetscGetMemType(v, &memtype)); 4404 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4405 PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar))); 4406 PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4407 } 4408 4409 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 4410 else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4411 4412 if (Annz) { 4413 MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa); 4414 PetscCallCUDA(cudaPeekAtLastError()); 4415 } 4416 4417 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 4418 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4419 4420 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4421 } else { 4422 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode)); 4423 } 4424 PetscFunctionReturn(0); 4425 } 4426 4427 /*@C 4428 
MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices. 4429 4430 Not collective 4431 4432 Input Parameters: 4433 + A - the matrix 4434 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 4435 4436 Output Parameters: 4437 + ia - the CSR row pointers 4438 - ja - the CSR column indices 4439 4440 Level: developer 4441 4442 Note: 4443 When compressed is true, the CSR structure does not contain empty rows 4444 4445 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4446 @*/ 4447 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4448 { 4449 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4450 CsrMatrix *csr; 4451 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4452 4453 PetscFunctionBegin; 4454 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4455 if (!i || !j) PetscFunctionReturn(0); 4456 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4457 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4458 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4459 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4460 csr = (CsrMatrix *)cusp->mat->mat; 4461 if (i) { 4462 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4463 if (!cusp->rowoffsets_gpu) { 4464 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4465 cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4466 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4467 } 4468 *i = cusp->rowoffsets_gpu->data().get(); 4469 } else *i = csr->row_offsets->data().get(); 4470 } 4471 if (j) *j = csr->column_indices->data().get(); 4472 PetscFunctionReturn(0); 4473 } 4474 4475 /*@C 4476 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with 
`MatSeqAIJCUSPARSEGetIJ()` 4477 4478 Not collective 4479 4480 Input Parameters: 4481 + A - the matrix 4482 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 4483 4484 Output Parameters: 4485 + ia - the CSR row pointers 4486 - ja - the CSR column indices 4487 4488 Level: developer 4489 4490 .seealso: `MatSeqAIJCUSPARSEGetIJ()` 4491 @*/ 4492 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4493 { 4494 PetscFunctionBegin; 4495 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4496 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4497 if (i) *i = NULL; 4498 if (j) *j = NULL; 4499 PetscFunctionReturn(0); 4500 } 4501 4502 /*@C 4503 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 4504 4505 Not Collective 4506 4507 Input Parameter: 4508 . A - a `MATSEQAIJCUSPARSE` matrix 4509 4510 Output Parameter: 4511 . 
a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: make sure the device copy is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* read-only access: nothing to flush, just invalidate the caller's pointer */
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: device copy must be current before the caller modifies it */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the GPU copy is now the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values may have changed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, the current contents are irrelevant */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite -
restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 4666 4667 Not Collective 4668 4669 Input Parameter: 4670 . A - a `MATSEQAIJCUSPARSE` matrix 4671 4672 Output Parameter: 4673 . a - pointer to the device data 4674 4675 Level: developer 4676 4677 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 4678 @*/ 4679 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4680 { 4681 PetscFunctionBegin; 4682 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4683 PetscValidPointer(a, 2); 4684 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4685 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4686 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4687 *a = NULL; 4688 PetscFunctionReturn(0); 4689 } 4690 4691 struct IJCompare4 { 4692 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4693 { 4694 if (t1.get<0>() < t2.get<0>()) return true; 4695 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4696 return false; 4697 } 4698 }; 4699 4700 struct Shift { 4701 int _shift; 4702 4703 Shift(int shift) : _shift(shift) { } 4704 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4705 }; 4706 4707 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4708 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4709 { 4710 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4711 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4712 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4713 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4714 PetscInt Annz, Bnnz; 4715 cusparseStatus_t stat; 4716 PetscInt i, m, n, zero = 0; 4717 4718 PetscFunctionBegin; 4719 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4720 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4721 PetscValidPointer(C, 4); 4722 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4723 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4724 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4725 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4726 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4727 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4728 if (reuse == MAT_INITIAL_MATRIX) { 4729 m = A->rmap->n; 4730 n = A->cmap->n + B->cmap->n; 4731 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4732 PetscCall(MatSetSizes(*C, m, n, m, n)); 4733 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4734 c = (Mat_SeqAIJ *)(*C)->data; 4735 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4736 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4737 Ccsr = new CsrMatrix; 4738 Cmat->cprowIndices = NULL; 4739 c->compressedrow.use = PETSC_FALSE; 4740 c->compressedrow.nrows = 0; 4741 c->compressedrow.i = NULL; 4742 c->compressedrow.rindex = NULL; 4743 Ccusp->workVector = NULL; 4744 Ccusp->nrows = m; 4745 Ccusp->mat = Cmat; 4746 Ccusp->mat->mat = Ccsr; 4747 Ccsr->num_rows = 
m; 4748 Ccsr->num_cols = n; 4749 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4750 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4751 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4752 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 4753 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 4754 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4755 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4756 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4757 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4758 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4759 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4760 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4761 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4762 4763 Acsr = (CsrMatrix *)Acusp->mat->mat; 4764 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4765 Annz = (PetscInt)Acsr->column_indices->size(); 4766 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4767 c->nz = Annz + Bnnz; 4768 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4769 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4770 Ccsr->values = new THRUSTARRAY(c->nz); 4771 Ccsr->num_entries = c->nz; 4772 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4773 if (c->nz) { 4774 auto Acoo = new THRUSTINTARRAY32(Annz); 4775 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4776 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4777 THRUSTINTARRAY32 *Aroff, *Broff; 4778 4779 if (a->compressedrow.use) { /* need full row offset */ 4780 if (!Acusp->rowoffsets_gpu) { 4781 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4782 
Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4783 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4784 } 4785 Aroff = Acusp->rowoffsets_gpu; 4786 } else Aroff = Acsr->row_offsets; 4787 if (b->compressedrow.use) { /* need full row offset */ 4788 if (!Bcusp->rowoffsets_gpu) { 4789 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4790 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4791 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4792 } 4793 Broff = Bcusp->rowoffsets_gpu; 4794 } else Broff = Bcsr->row_offsets; 4795 PetscCall(PetscLogGpuTimeBegin()); 4796 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4797 PetscCallCUSPARSE(stat); 4798 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4799 PetscCallCUSPARSE(stat); 4800 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4801 auto Aperm = thrust::make_constant_iterator(1); 4802 auto Bperm = thrust::make_constant_iterator(0); 4803 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4804 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4805 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4806 #else 4807 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4808 auto Bcib = Bcsr->column_indices->begin(); 4809 auto Bcie = Bcsr->column_indices->end(); 4810 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 4811 #endif 4812 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4813 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4814 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4815 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4816 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4817 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4818 auto p1 = Ccusp->cooPerm->begin(); 4819 auto p2 = Ccusp->cooPerm->begin(); 4820 thrust::advance(p2, Annz); 4821 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4822 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 4823 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 4824 #endif 4825 auto cci = thrust::make_counting_iterator(zero); 4826 auto cce = thrust::make_counting_iterator(c->nz); 4827 #if 0 //Errors on SUMMIT cuda 11.1.0 4828 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4829 #else 4830 auto pred = thrust::identity<int>(); 4831 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4832 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 4833 #endif 4834 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4835 PetscCallCUSPARSE(stat); 4836 PetscCall(PetscLogGpuTimeEnd()); 4837 delete wPerm; 4838 delete Acoo; 4839 delete Bcoo; 4840 delete Ccoo; 4841 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4842 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4843 PetscCallCUSPARSE(stat); 4844 #endif 4845 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4846 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4847 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4848 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4849 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4850 CsrMatrix *CcsrT = new CsrMatrix; 4851 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4852 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4853 4854 (*C)->form_explicit_transpose = PETSC_TRUE; 4855 (*C)->transupdated = PETSC_TRUE; 4856 Ccusp->rowoffsets_gpu = NULL; 4857 CmatT->cprowIndices = NULL; 4858 CmatT->mat = CcsrT; 4859 CcsrT->num_rows = n; 4860 CcsrT->num_cols = m; 4861 CcsrT->num_entries = c->nz; 4862 4863 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4864 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4865 CcsrT->values = new THRUSTARRAY(c->nz); 4866 4867 PetscCall(PetscLogGpuTimeBegin()); 4868 auto rT = CcsrT->row_offsets->begin(); 4869 if (AT) { 4870 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4871 thrust::advance(rT, -1); 4872 } 4873 if (BT) { 4874 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4875 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4876 thrust::copy(titb, tite, rT); 4877 } 4878 auto cT = CcsrT->column_indices->begin(); 4879 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4880 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4881 auto vT = CcsrT->values->begin(); 4882 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4883 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4884 PetscCall(PetscLogGpuTimeEnd()); 4885 4886 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4887 
PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 4888 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4889 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 4890 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 4891 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4892 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4893 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4894 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4895 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4896 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4897 PetscCallCUSPARSE(stat); 4898 #endif 4899 Ccusp->matTranspose = CmatT; 4900 } 4901 } 4902 4903 c->singlemalloc = PETSC_FALSE; 4904 c->free_a = PETSC_TRUE; 4905 c->free_ij = PETSC_TRUE; 4906 PetscCall(PetscMalloc1(m + 1, &c->i)); 4907 PetscCall(PetscMalloc1(c->nz, &c->j)); 4908 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4909 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4910 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4911 ii = *Ccsr->row_offsets; 4912 jj = *Ccsr->column_indices; 4913 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4914 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4915 } else { 4916 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), 
Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4917 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4918 } 4919 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 4920 PetscCall(PetscMalloc1(m, &c->ilen)); 4921 PetscCall(PetscMalloc1(m, &c->imax)); 4922 c->maxnz = c->nz; 4923 c->nonzerorowcnt = 0; 4924 c->rmax = 0; 4925 for (i = 0; i < m; i++) { 4926 const PetscInt nn = c->i[i + 1] - c->i[i]; 4927 c->ilen[i] = c->imax[i] = nn; 4928 c->nonzerorowcnt += (PetscInt) !!nn; 4929 c->rmax = PetscMax(c->rmax, nn); 4930 } 4931 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4932 PetscCall(PetscMalloc1(c->nz, &c->a)); 4933 (*C)->nonzerostate++; 4934 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4935 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4936 Ccusp->nonzerostate = (*C)->nonzerostate; 4937 (*C)->preallocated = PETSC_TRUE; 4938 } else { 4939 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4940 c = (Mat_SeqAIJ *)(*C)->data; 4941 if (c->nz) { 4942 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4943 PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 4944 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4945 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 4946 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4947 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4948 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4949 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4950 Acsr = (CsrMatrix *)Acusp->mat->mat; 4951 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4952 Ccsr 
= (CsrMatrix *)Ccusp->mat->mat; 4953 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4954 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4955 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4956 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4957 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4958 auto pmid = Ccusp->cooPerm->begin(); 4959 thrust::advance(pmid, Acsr->num_entries); 4960 PetscCall(PetscLogGpuTimeBegin()); 4961 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 4962 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4963 thrust::for_each(zibait, zieait, VecCUDAEquals()); 4964 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4965 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4966 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 4967 
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4968 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4969 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4970 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4971 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4972 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4973 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4974 auto vT = CcsrT->values->begin(); 4975 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4976 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4977 (*C)->transupdated = PETSC_TRUE; 4978 } 4979 PetscCall(PetscLogGpuTimeEnd()); 4980 } 4981 } 4982 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4983 (*C)->assembled = PETSC_TRUE; 4984 (*C)->was_assembled = PETSC_FALSE; 4985 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4986 PetscFunctionReturn(0); 4987 } 4988 4989 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4990 { 4991 bool dmem; 4992 const PetscScalar *av; 4993 4994 PetscFunctionBegin; 4995 dmem = isCudaMem(v); 4996 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 4997 if (n && idx) { 4998 THRUSTINTARRAY widx(n); 4999 widx.assign(idx, idx + n); 5000 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 5001 5002 THRUSTARRAY *w = NULL; 5003 thrust::device_ptr<PetscScalar> dv; 5004 if (dmem) { 5005 dv = thrust::device_pointer_cast(v); 5006 } else { 5007 w = new THRUSTARRAY(n); 5008 dv = w->data(); 5009 } 5010 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 5011 5012 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 5013 auto zieit = 
thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 5014 thrust::for_each(zibit, zieit, VecCUDAEquals()); 5015 if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 5016 delete w; 5017 } else { 5018 PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 5019 } 5020 if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 5021 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 5022 PetscFunctionReturn(0); 5023 } 5024