/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Option-name table for -mat_cusparse_storage_format etc.; trailing entries are the
   enum-type name and option prefix required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
/* Names listed in 0-based enum-value order so PetscOptionsEnum() positions map to the cuSPARSE values above */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* ---- forward declarations: factorization ---- */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

/* ---- forward declarations: triangular solves and matrix operations ---- */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* ---- forward declarations: destruction helpers ---- */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE: records the requested
   GPU storage format in the matrix's Mat_SeqAIJCUSPARSE structure */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  /* sequential matrices keep a single format field, so MULT and ALL both set it */
  case MAT_CUSPARSE_MULT: cusparsestruct->format = format; break;
  case MAT_CUSPARSE_ALL: cusparsestruct->format = format; break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
. op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
  `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; silently a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE: records the flag
   that steers MatLUFactorNumeric_SeqAIJCUSPARSE() toward CPU or GPU triangular solves */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

  Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}

/* MatSetOption() implementation: intercepts MAT_FORM_EXPLICIT_TRANSPOSE (which requires
   invalidating any cached GPU transpose) and forwards everything else to the CPU AIJ code */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself is done by the CPU SeqAIJ kernel
   (after ensuring A's values are current on the host); afterwards the solve function
   pointers are chosen and, unless CPU solves were requested, the triangular factors
   are analyzed and copied to the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  IS isrow = b->row, iscol = b->col;
  PetscBool row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors were just written on the host */
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    /* identity row/column permutations allow the cheaper natural-ordering solve path */
    if (row_identity && col_identity) {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* runtime options (storage format, CPU solve flag and,
   for CUDA >= 11, the SpMV/SpMM/csr2csc algorithm choices); only meaningful for
   unfactored matrices */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool flg;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    /* NOTE(review): PetscOptionsBool() already stores the result into use_cpu_solve; the
       follow-up MatCUSPARSESetUseCPUSolve() call re-stores the same value via the method
       interface — presumably kept for uniformity, confirm before simplifying */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Builds (or updates) the GPU copy of the unit-diagonal lower triangular ILU factor L
   from the host SeqAIJ factored data and runs the cuSPARSE csrsv solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt *ai = a->i, *aj = a->j, *vi;
  const MatScalar *aa = a->a, *v;
  PetscInt *AiLo, *AjLo;
  PetscInt i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first build: assemble L in CSR form in pinned host buffers, then upload */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v = aa;
        vi = aj;
        offset = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the implicit unit diagonal after the strictly-lower entries of row i */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for later value-only updates; free the index buffers */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v = aa;
        vi = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; /* unit diagonal */
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}

/* Builds (or updates) the GPU copy of the upper triangular ILU factor U (non-unit
   diagonal, stored as 1/diag) and runs the cuSPARSE csrsv solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors =
(Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar *aa = a->a, *v;
  PetscInt *AiUp, *AjUp;
  PetscInt i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        /* first build: assemble U in CSR form in pinned host buffers, then upload */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from the bottom up
           (the factored SeqAIJ U part is stored via adiag[] in reverse) */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements: the host factor stores 1/diag, invert it back */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i] = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for later value-only updates; free the index buffers */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only; the sparsity pattern is unchanged */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}

/* Copies both ILU triangular factors to the GPU (building analysis info on first use)
   and caches the row/column permutation index arrays on the device when the orderings
   are not identities */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS isrow = a->row, iscol = a->icol;
  PetscBool row_identity, col_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector used between the two triangular solves */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n *
sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Builds (or updates) the GPU triangular factors for ICC: only the upper factor U is
   assembled; the "lower" solve reuses U's structure with CUSPARSE_OPERATION_TRANSPOSE
   (i.e. it solves with U^T), so two value arrays share one index set */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscScalar *AALo;
  PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* the Cholesky factor is stored in SBAIJ layout; reinterpret the data pointer */
  Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
  const PetscInt *ai = b->i, *aj = b->j, *vj;
  const MatScalar *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i] = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonal entries are negated for U; the U^T-solve values are additionally
               scaled by the diagonal */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* UPPER fill mode is intentional here: combined with the TRANSPOSE solve op below,
           this effectively solves with the lower factor U^T */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else { /* factors already exist: refresh the values only, pattern is unchanged */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}

/* Copies the ICC triangular factors to the GPU and caches the permutation indices;
   definition continues beyond this chunk */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS ip = a->row;
  PetscBool perm_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* build the upper/lower ICC triangular-factor CSR structures on the GPU */
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  /* scratch vector reused as the intermediate in the two triangular solves */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* combined nonzero count of the factor pair (a->nz is the stored upper triangle) — used for flop logging */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS iip;
    const PetscInt *irip, *rip;

    /* upload the row permutation and its inverse so the solves can (un)permute on the device */
    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: the factorization itself runs on
   the CPU (MatCholeskyFactorNumeric_SeqAIJ); this wrapper then selects the GPU solve
   routines and pushes the triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  IS ip = b->row;
  PetscBool perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    /* natural ordering: the solves need no permutation of b/x */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Build the transposes of both triangular factors (via csr2csc) and run the csrsv solve
   analysis on them, so MatSolveTranspose can use NON_TRANSPOSE solves on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t indexBase;
  cusparseMatrixType_t matrixType;
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposition flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transposed factor is solved without op(A) transposition */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor (dimensions swapped) */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e.
the CSC */ 861 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 862 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 863 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 864 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 865 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 866 #endif 867 868 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 869 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 870 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 871 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 872 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 873 #else 874 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 875 #endif 876 PetscCallCUDA(WaitForCUDA()); 877 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 878 879 /* Create the solve analysis information */ 880 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 881 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 882 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 883 
  /* size and allocate the csrsv2 work buffer for the transposed lower factor (CUDA >= 9 API) */
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor;
     transposition flips the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation: the transposed factor is solved without op(A) transposition */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor (dimensions swapped) */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 936 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 937 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 938 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 939 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 940 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 941 #endif 942 943 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 944 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 945 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 946 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 947 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 948 #else 949 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 950 #endif 951 952 PetscCallCUDA(WaitForCUDA()); 953 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 954 955 /* Create the solve analysis information */ 956 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 957 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 958 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 959 
  /* size and allocate the csrsv2 work buffer for the transposed upper factor (CUDA >= 9 API) */
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: convert a PetscScalar to a PetscInt via its real part. Used by
   MatSeqAIJCUSPARSEFormExplicitTranspose to recover the CSR->CSC permutation indices
   (computed by csr2csc over a 0,1,2,... scalar sequence) as integers. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/* Build (or refresh the values of) an explicit transpose of the matrix on the GPU,
   cached in cusparsestruct->matTranspose and marked current via A->transupdated. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t stat;
  cusparseIndexBase_t indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* nothing to do if the cached transpose is already current */
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident scalars used by SpMV with the transpose) */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      /* the transpose has swapped row/column dimensions */
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* compute the CSR->CSC entry permutation once: run csr2csc over the scalar
         sequence 0,1,2,... and read back where each entry landed */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* scatter the current values into the transpose using the cached csr2csc_i permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U: input xarray (permuted rhs), output tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L: input tempGPU, output xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b for a factored matrix stored in natural ordering: identical to the
   permuted variant above but without the row/column permutation steps. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U: input barray, output tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L: input tempGPU, output xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b using the GPU-resident triangular factors; the factorization used row/column
   permutations, so b is gathered through the row permutation into a work vector first and
   the solution is scattered back through the column permutation at the end. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  cusparseStatus_t                       stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: temp = b(rperm). The range length comes from the
     rpermIndices begin/end index iterators of the permutation iterators. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation: x = temp(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b for a factorization done in natural ordering: no permutations, just the two
   triangular solves L then U through the work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct
*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */

/* Solve with the ILU(0) factors computed by cusparseXcsrilu02(): L y = b, then U x = y.
   The dense-vector descriptors are rebound to the caller's arrays on every solve. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!

  /* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b with the ILU(0) factors: U^T y = b, then L^T x = y. The transpose SpSV
   descriptors/buffers are created lazily on the first transpose solve, and the (numeric)
   transpose analyses are redone whenever the factor values have changed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric ILU(0): copy A's values into the in-place factor storage, run cusparseXcsrilu02(),
   then (re)do the numeric SpSV analyses used by the non-transpose solves. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status =
cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ILU(0) setup: duplicates A's layout into fact, copies A's CSR structure to device
   arrays for in-place factorization, creates the cuSPARSE matrix/vector/SpSV descriptors,
   sizes and allocates the factorization and solve buffers, and runs the structural analysis.
   The numeric phase is MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
    */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        /* NOTE(review): the assignment below overwrites the nzLeft just computed from Adiag, making that
           earlier store dead. It only affects this logged FLOPs estimate, but confirm which count was meant. */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Solve with the IC(0) factor: L y = b, then L^T x = y. The same L matrix descriptor is used for
   both steps, with a transpose SpSV descriptor for the second. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric IC(0): copy A's values into the factor, run cusparseXcsric02() in place, then redo the
   (numeric) SpSV analyses for both the forward and transpose solves. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask         = PETSC_OFFLOAD_GPU;
  fact->ops->solve          = MatSolve_SeqAIJCUSPARSE_ICC0;
  /* The Cholesky-type factor is symmetric, so the transpose solve reuses the forward solve */
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
/* Symbolic IC(0) factorization entirely on the GPU: since level-0 incomplete Cholesky has the same
   sparsity pattern as the lower triangle of A, this just mirrors A's CSR structure to device arrays,
   creates the cuSPARSE descriptors for M (the in-place factor) and L, sizes and allocates the work
   buffers, runs the csric02 analysis, and estimates the numeric-phase flops.
   The permutation perm must be the identity (checked by the caller). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares device arrays with M: it views the lower triangle of the in-place factor */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M; /* aliased: fact buffer reused by the larger solve buffer */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* assume half the off-diagonal nonzeros are to the left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
#endif

/* ILU symbolic dispatcher: use the all-on-device ILU(0) fast path when no levels of fill are
   requested and both row/column permutations are identities; otherwise fall back to the host
   symbolic factorization with the generic CUSPARSE numeric kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }

  PetscFunctionReturn(0);
}

/* Full LU symbolic: always done on host (SeqAIJ); only the numeric phase uses CUSPARSE. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* ICC symbolic dispatcher: all-on-device IC(0) fast path when no fill and identity permutation,
   otherwise host symbolic + CUSPARSE numeric. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* Full Cholesky symbolic: always done on host (SeqAIJ); numeric phase uses CUSPARSE. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Reports this factorization package's MatSolverType name ("cusparse"). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type =
    MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factory routine registered with the MATSOLVERCUSPARSE package: creates the (empty) factor matrix B
   for the requested factor type, installs the symbolic-factorization function pointers (device or
   host versions depending on A's CPU binding), and records the -mat_factor_bind_factorization choice. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* prefer the factor matrix's own options prefix; fall back to A's */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Mirrors the device CSR values back to the host arrays when the up-to-date copy lives on the GPU.
   Handles both unfactored matrices (values in the Mat_SeqAIJCUSPARSE CsrMatrix) and device-factored
   matrices (values in the TriFactors' csrVal). On success the offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

static
/* Get read-write access to the host value array, first syncing values down from the GPU. */
PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

/* Restore after read-write access: host copy is now authoritative. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Get read-only access to the host value array; offload mask is left untouched. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: no device-to-host copy needed since existing values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Expose the device CSR arrays (row offsets, column indices, values) and their memory type.
   Only valid for unfactored matrices; 32-bit cuSPARSE indices preclude PETSC_USE_64BIT_INDICES. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* fixed grammar of the error message ("does not supported" -> "does not support") */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Push the host CSR data to the GPU. If the nonzero pattern is unchanged and the format is CSR,
   only values are refreshed; otherwise the whole device structure is rebuilt. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage: only rows with nonzeros are represented; ridx maps them back */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload structure only, and do not claim the data is on both sides */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalars 1 and 0 so cusparse can use CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage a temporary CSR on device, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate the first tuple element into the second (y += x). */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: copy the first tuple element to the second (y = x). */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: copy the second tuple element to the first (x = y). */
struct VecCUDAEqualsReverse {
template <typename Tuple> 2330 __host__ __device__ void operator()(Tuple t) { 2331 thrust::get<0>(t) = thrust::get<1>(t); 2332 } 2333 }; 2334 2335 struct MatMatCusparse { 2336 PetscBool cisdense; 2337 PetscScalar *Bt; 2338 Mat X; 2339 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2340 PetscLogDouble flops; 2341 CsrMatrix *Bcsr; 2342 2343 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2344 cusparseSpMatDescr_t matSpBDescr; 2345 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2346 cusparseDnMatDescr_t matBDescr; 2347 cusparseDnMatDescr_t matCDescr; 2348 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2349 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2350 void *dBuffer4; 2351 void *dBuffer5; 2352 #endif 2353 size_t mmBufferSize; 2354 void *mmBuffer; 2355 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2356 cusparseSpGEMMDescr_t spgemmDesc; 2357 #endif 2358 }; 2359 2360 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) { 2361 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2362 2363 PetscFunctionBegin; 2364 PetscCallCUDA(cudaFree(mmdata->Bt)); 2365 delete mmdata->Bcsr; 2366 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2367 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2368 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2369 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2370 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2371 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2372 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2373 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2374 #endif 2375 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2376 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2377 #endif 2378 
PetscCall(MatDestroy(&mmdata->X)); 2379 PetscCall(PetscFree(data)); 2380 PetscFunctionReturn(0); 2381 } 2382 2383 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool); 2384 2385 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) { 2386 Mat_Product *product = C->product; 2387 Mat A, B; 2388 PetscInt m, n, blda, clda; 2389 PetscBool flg, biscuda; 2390 Mat_SeqAIJCUSPARSE *cusp; 2391 cusparseStatus_t stat; 2392 cusparseOperation_t opA; 2393 const PetscScalar *barray; 2394 PetscScalar *carray; 2395 MatMatCusparse *mmdata; 2396 Mat_SeqAIJCUSPARSEMultStruct *mat; 2397 CsrMatrix *csrmat; 2398 2399 PetscFunctionBegin; 2400 MatCheckProduct(C, 1); 2401 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2402 mmdata = (MatMatCusparse *)product->data; 2403 A = product->A; 2404 B = product->B; 2405 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2406 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2407 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2408 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2409 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2410 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2411 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2412 switch (product->type) { 2413 case MATPRODUCT_AB: 2414 case MATPRODUCT_PtAP: 2415 mat = cusp->mat; 2416 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2417 m = A->rmap->n; 2418 n = B->cmap->n; 2419 break; 2420 case MATPRODUCT_AtB: 2421 if (!A->form_explicit_transpose) { 2422 mat = cusp->mat; 2423 opA = CUSPARSE_OPERATION_TRANSPOSE; 2424 } else { 2425 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2426 mat = cusp->matTranspose; 2427 
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2428 } 2429 m = A->cmap->n; 2430 n = B->cmap->n; 2431 break; 2432 case MATPRODUCT_ABt: 2433 case MATPRODUCT_RARt: 2434 mat = cusp->mat; 2435 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2436 m = A->rmap->n; 2437 n = B->rmap->n; 2438 break; 2439 default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2440 } 2441 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2442 csrmat = (CsrMatrix *)mat->mat; 2443 /* if the user passed a CPU matrix, copy the data to the GPU */ 2444 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2445 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2446 PetscCall(MatDenseCUDAGetArrayRead(B, &barray)); 2447 2448 PetscCall(MatDenseGetLDA(B, &blda)); 2449 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2450 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray)); 2451 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2452 } else { 2453 PetscCall(MatDenseCUDAGetArrayWrite(C, &carray)); 2454 PetscCall(MatDenseGetLDA(C, &clda)); 2455 } 2456 2457 PetscCall(PetscLogGpuTimeBegin()); 2458 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2459 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2460 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2461 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2462 size_t mmBufferSize; 2463 if (mmdata->initialized && mmdata->Blda != blda) { 2464 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2465 mmdata->matBDescr = NULL; 2466 } 2467 if (!mmdata->matBDescr) { 2468 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2469 mmdata->Blda = blda; 2470 } 2471 2472 if (mmdata->initialized && mmdata->Clda != clda) { 2473 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2474 mmdata->matCDescr = NULL; 2475 } 2476 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2477 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2478 mmdata->Clda = clda; 2479 } 2480 2481 if (!mat->matDescr) { 2482 stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2483 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2484 PetscCallCUSPARSE(stat); 2485 } 2486 stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 2487 PetscCallCUSPARSE(stat); 2488 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2489 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2490 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2491 mmdata->mmBufferSize = mmBufferSize; 2492 } 2493 mmdata->initialized = PETSC_TRUE; 2494 } else { 2495 /* to be safe, 
always update pointers of the mats */ 2496 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 2497 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2498 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2499 } 2500 2501 /* do cusparseSpMM, which supports transpose on B */ 2502 stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 2503 PetscCallCUSPARSE(stat); 2504 #else 2505 PetscInt k; 2506 /* cusparseXcsrmm does not support transpose on B */ 2507 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2508 cublasHandle_t cublasv2handle; 2509 cublasStatus_t cerr; 2510 2511 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2512 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2513 PetscCallCUBLAS(cerr); 2514 blda = B->cmap->n; 2515 k = B->cmap->n; 2516 } else { 2517 k = B->rmap->n; 2518 } 2519 2520 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2521 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  /* 2*nnz(A) flops per dense column of op(B): n columns total */
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    /* X holds the intermediate A*R^T computed above; C = R*X completes R*A*R^T */
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    /* X holds the intermediate A*P computed above; C = P^T*X (first arg transposed) completes P^T*A*P */
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* undo the temporary CPU->GPU conversions performed on entry, so callers get back the types they passed in */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/*
   Symbolic (setup) phase for products of a SEQAIJCUSPARSE matrix A with a dense matrix B.
   Sets the sizes and (GPU dense) type of C, and allocates the MatMatCusparse product data,
   including the intermediate dense matrix X needed by the two-step PtAP/RARt products.
   The numeric phase is MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back to the CPU type */
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  /* Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all SEQAIJCUSPARSE.
     The sparsity pattern of C and all cuSPARSE descriptors/buffers were created in
     MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; here only the values are (re)computed. */
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just finish assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetry lets AtB/ABt be computed as plain AB with the untransposed operand; the symbolic
     phase must have made the same decision, which the checks below enforce */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized via the explicitly stored matTranspose structs (spgemm has no transpose support) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: reuse the symbolic structure, recompute values only */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: recompute with the buffers allocated in the symbolic phase, then copy into C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* pre-CUDA-11 legacy csrgemm interface */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of the sparse-sparse product C = op(A)*op(B) with A, B SEQAIJCUSPARSE.
   Computes the sparsity pattern of C on the GPU (via cusparseSpGEMM/SpGEMMreuse or the
   legacy csrgemm interface, depending on the CUDA version), mirrors it on the host, and
   stores all descriptors/buffers in the MatMatCusparse product data for the numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2769 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2770 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2771 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2772 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2773 2774 ptype = product->type; 2775 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2776 ptype = MATPRODUCT_AB; 2777 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2778 } 2779 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2780 ptype = MATPRODUCT_AB; 2781 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2782 } 2783 biscompressed = PETSC_FALSE; 2784 ciscompressed = PETSC_FALSE; 2785 switch (ptype) { 2786 case MATPRODUCT_AB: 2787 m = A->rmap->n; 2788 n = B->cmap->n; 2789 k = A->cmap->n; 2790 Amat = Acusp->mat; 2791 Bmat = Bcusp->mat; 2792 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2793 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2794 break; 2795 case MATPRODUCT_AtB: 2796 m = A->cmap->n; 2797 n = B->cmap->n; 2798 k = A->rmap->n; 2799 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2800 Amat = Acusp->matTranspose; 2801 Bmat = Bcusp->mat; 2802 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2803 break; 2804 case MATPRODUCT_ABt: 2805 m = A->rmap->n; 2806 n = B->rmap->n; 2807 k = A->cmap->n; 2808 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2809 Amat = Acusp->mat; 2810 Bmat = Bcusp->matTranspose; 2811 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2812 break; 2813 default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2814 } 2815 2816 /* create cusparse matrix */ 2817 PetscCall(MatSetSizes(C, m, n, m, n)); 2818 
PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2819 c = (Mat_SeqAIJ *)C->data; 2820 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2821 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2822 Ccsr = new CsrMatrix; 2823 2824 c->compressedrow.use = ciscompressed; 2825 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2826 c->compressedrow.nrows = a->compressedrow.nrows; 2827 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 2828 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2829 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2830 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2831 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2832 } else { 2833 c->compressedrow.nrows = 0; 2834 c->compressedrow.i = NULL; 2835 c->compressedrow.rindex = NULL; 2836 Ccusp->workVector = NULL; 2837 Cmat->cprowIndices = NULL; 2838 } 2839 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2840 Ccusp->mat = Cmat; 2841 Ccusp->mat->mat = Ccsr; 2842 Ccsr->num_rows = Ccusp->nrows; 2843 Ccsr->num_cols = n; 2844 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 2845 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2846 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2847 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2848 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 2849 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 2850 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2851 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2852 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2853 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2854 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2855 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2856 c->nz = 0; 2857 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2858 Ccsr->values = new THRUSTARRAY(c->nz); 2859 goto finalizesym; 2860 } 2861 2862 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2863 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2864 Acsr = (CsrMatrix *)Amat->mat; 2865 if (!biscompressed) { 2866 Bcsr = (CsrMatrix *)Bmat->mat; 2867 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2868 BmatSpDescr = Bmat->matDescr; 2869 #endif 2870 } else { /* we need to use row offsets for the full matrix */ 2871 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2872 Bcsr = new CsrMatrix; 2873 Bcsr->num_rows = B->rmap->n; 2874 Bcsr->num_cols = cBcsr->num_cols; 2875 Bcsr->num_entries = cBcsr->num_entries; 2876 Bcsr->column_indices = cBcsr->column_indices; 2877 Bcsr->values = cBcsr->values; 2878 if (!Bcusp->rowoffsets_gpu) { 2879 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2880 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 2881 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2882 } 2883 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2884 mmdata->Bcsr = Bcsr; 2885 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2886 if (Bcsr->num_rows && Bcsr->num_cols) { 2887 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2888 PetscCallCUSPARSE(stat); 2889 } 2890 BmatSpDescr = mmdata->matSpBDescr; 2891 #endif 2892 } 2893 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2894 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2895 /* precompute flops count */ 2896 if (ptype == MATPRODUCT_AB) { 2897 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2898 const PetscInt st = a->i[i]; 2899 const PetscInt en = a->i[i + 1]; 2900 for (j = st; j < en; j++) { 2901 const PetscInt brow = a->j[j]; 2902 flops += 2. * (b->i[brow + 1] - b->i[brow]); 2903 } 2904 } 2905 } else if (ptype == MATPRODUCT_AtB) { 2906 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2907 const PetscInt anzi = a->i[i + 1] - a->i[i]; 2908 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2909 flops += (2. * anzi) * bnzi; 2910 } 2911 } else { /* TODO */ 2912 flops = 0.; 2913 } 2914 2915 mmdata->flops = flops; 2916 PetscCall(PetscLogGpuTimeBegin()); 2917 2918 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2919 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2920 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2921 PetscCallCUSPARSE(stat); 2922 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2923 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2924 { 2925 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2926 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2927 */ 2928 void *dBuffer1 = NULL; 2929 void *dBuffer2 = NULL; 2930 void *dBuffer3 = NULL; 2931 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2932 size_t bufferSize1 = 0; 2933 size_t bufferSize2 = 0; 2934 size_t bufferSize3 = 0; 2935 size_t bufferSize4 = 0; 2936 size_t bufferSize5 = 0; 2937 2938 /*----------------------------------------------------------------------*/ 2939 /* ask bufferSize1 bytes for external memory */ 2940 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 2941 PetscCallCUSPARSE(stat); 2942 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 2943 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2944 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 2945 PetscCallCUSPARSE(stat); 2946 2947 /*----------------------------------------------------------------------*/ 2948 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 2949 PetscCallCUSPARSE(stat); 2950 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 2951 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 2952 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 2953 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 2954 PetscCallCUSPARSE(stat); 2955 
PetscCallCUDA(cudaFree(dBuffer1)); 2956 PetscCallCUDA(cudaFree(dBuffer2)); 2957 2958 /*----------------------------------------------------------------------*/ 2959 /* get matrix C non-zero entries C_nnz1 */ 2960 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2961 c->nz = (PetscInt)C_nnz1; 2962 /* allocate matrix C */ 2963 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2964 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2965 Ccsr->values = new THRUSTARRAY(c->nz); 2966 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2967 /* update matC with the new pointers */ 2968 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 2969 PetscCallCUSPARSE(stat); 2970 2971 /*----------------------------------------------------------------------*/ 2972 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 2973 PetscCallCUSPARSE(stat); 2974 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 2975 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 2976 PetscCallCUSPARSE(stat); 2977 PetscCallCUDA(cudaFree(dBuffer3)); 2978 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2979 PetscCallCUSPARSE(stat); 2980 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 
/ 1024)); 2981 } 2982 #else 2983 size_t bufSize2; 2984 /* ask bufferSize bytes for external memory */ 2985 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 2986 PetscCallCUSPARSE(stat); 2987 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2988 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2989 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 2990 PetscCallCUSPARSE(stat); 2991 /* ask bufferSize again bytes for external memory */ 2992 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 2993 PetscCallCUSPARSE(stat); 2994 /* The CUSPARSE documentation is not clear, nor the API 2995 We need both buffers to perform the operations properly! 2996 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2997 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2998 is stored in the descriptor! What a messy API... 
*/ 2999 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3000 /* compute the intermediate product of A * B */ 3001 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3002 PetscCallCUSPARSE(stat); 3003 /* get matrix C non-zero entries C_nnz1 */ 3004 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3005 c->nz = (PetscInt)C_nnz1; 3006 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3007 mmdata->mmBufferSize / 1024)); 3008 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3009 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3010 Ccsr->values = new THRUSTARRAY(c->nz); 3011 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3012 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3013 PetscCallCUSPARSE(stat); 3014 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3015 PetscCallCUSPARSE(stat); 3016 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3017 #else 3018 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3019 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3020 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3021 PetscCallCUSPARSE(stat); 3022 c->nz = cnz; 3023 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3024 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3025 Ccsr->values = new THRUSTARRAY(c->nz); 3026 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3027 3028 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3029 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3030 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3031 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3032 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3033 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3034 PetscCallCUSPARSE(stat); 3035 #endif 3036 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3037 PetscCall(PetscLogGpuTimeEnd()); 3038 finalizesym: 3039 c->singlemalloc = PETSC_FALSE; 3040 c->free_a = PETSC_TRUE; 3041 c->free_ij = PETSC_TRUE; 3042 PetscCall(PetscMalloc1(m + 1, &c->i)); 3043 PetscCall(PetscMalloc1(c->nz, &c->j)); 3044 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3045 PetscInt *d_i = c->i; 3046 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3047 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3048 ii = *Ccsr->row_offsets; 3049 jj = *Ccsr->column_indices; 3050 
if (ciscompressed) d_i = c->compressedrow.i; 3051 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3052 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3053 } else { 3054 PetscInt *d_i = c->i; 3055 if (ciscompressed) d_i = c->compressedrow.i; 3056 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3057 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3058 } 3059 if (ciscompressed) { /* need to expand host row offsets */ 3060 PetscInt r = 0; 3061 c->i[0] = 0; 3062 for (k = 0; k < c->compressedrow.nrows; k++) { 3063 const PetscInt next = c->compressedrow.rindex[k]; 3064 const PetscInt old = c->compressedrow.i[k]; 3065 for (; r < next; r++) c->i[r + 1] = old; 3066 } 3067 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3068 } 3069 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3070 PetscCall(PetscMalloc1(m, &c->ilen)); 3071 PetscCall(PetscMalloc1(m, &c->imax)); 3072 c->maxnz = c->nz; 3073 c->nonzerorowcnt = 0; 3074 c->rmax = 0; 3075 for (k = 0; k < m; k++) { 3076 const PetscInt nn = c->i[k + 1] - c->i[k]; 3077 c->ilen[k] = c->imax[k] = nn; 3078 c->nonzerorowcnt += (PetscInt) !!nn; 3079 c->rmax = PetscMax(c->rmax, nn); 3080 } 3081 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3082 PetscCall(PetscMalloc1(c->nz, &c->a)); 3083 Ccsr->num_entries = c->nz; 3084 3085 C->nonzerostate++; 3086 PetscCall(PetscLayoutSetUp(C->rmap)); 3087 PetscCall(PetscLayoutSetUp(C->cmap)); 3088 Ccusp->nonzerostate = C->nonzerostate; 3089 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3090 C->preallocated = PETSC_TRUE; 3091 C->assembled = PETSC_FALSE; 3092 C->was_assembled = PETSC_FALSE; 3093 if 
(product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* Select the backend (cuSPARSE or CPU SeqAIJ) for the matrix product held in mat->product and
   install the matching symbolic-product callback. Handles sparse or dense B. The user may force
   the CPU backend per product type via -mat*_backend_cpu / -mat_product_algorithm_backend_cpu. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* a matrix bound to the CPU cannot use the GPU backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default: break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Scatter-add the n entries of x into y at positions idx: y[idx[i]] += x[i].
   Launched with a 1D grid sized ceil(n/256) x 256; the guard handles the tail.
   The index is computed in PetscInt so it does not truncate to 32 bits when
   PetscInt is 64-bit and n exceeds INT_MAX (the old 'int i' would overflow). */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3247 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) { 3248 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3249 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3250 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3251 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3252 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3253 PetscBool compressed; 3254 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3255 PetscInt nx, ny; 3256 #endif 3257 3258 PetscFunctionBegin; 3259 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3260 if (!a->nz) { 3261 if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0)); 3262 else PetscCall(VecCopy_SeqCUDA(yy, zz)); 3263 PetscFunctionReturn(0); 3264 } 3265 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3266 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3267 if (!trans) { 3268 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3269 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3270 } else { 3271 if (herm || !A->form_explicit_transpose) { 3272 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3273 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3274 } else { 3275 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3276 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3277 } 3278 } 3279 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3280 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3281 3282 try { 3283 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3284 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 3285 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3286 3287 PetscCall(PetscLogGpuTimeBegin()); 3288 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3289 /* z = A x + beta y. 3290 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3291 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3292 */ 3293 xptr = xarray; 3294 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3295 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3296 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3297 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3298 allocated to accommodate different uses. So we get the length info directly from mat. 3299 */ 3300 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3301 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3302 nx = mat->num_cols; 3303 ny = mat->num_rows; 3304 } 3305 #endif 3306 } else { 3307 /* z = A^T x + beta y 3308 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3309 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3310 */ 3311 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3312 dptr = zarray; 3313 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3314 if (compressed) { /* Scatter x to work vector */ 3315 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3316 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3317 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3318 } 3319 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3320 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3321 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3322 nx = mat->num_rows; 3323 ny = mat->num_cols; 3324 } 3325 #endif 3326 } 3327 3328 /* csr_spmv does y = alpha op(A) x + beta y */ 3329 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3330 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3331 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3332 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3333 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3334 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3335 PetscCallCUSPARSE( 3336 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3337 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3338 3339 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3340 } else { 3341 /* x, y's value pointers might change between 
calls, but their shape is kept, so we just update pointers */ 3342 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3343 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3344 } 3345 3346 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3347 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3348 #else 3349 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3350 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3351 #endif 3352 } else { 3353 if (cusparsestruct->nrows) { 3354 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3355 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3356 #else 3357 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3358 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3359 #endif 3360 } 3361 } 3362 PetscCall(PetscLogGpuTimeEnd()); 3363 3364 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3365 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3366 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3367 PetscCall(VecCopy_SeqCUDA(yy, zz)); /* zz = yy */ 3368 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3369 PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ 3370 } 3371 } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3372 PetscCall(VecSet_SeqCUDA(zz, 0)); 3373 } 3374 3375 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3376 if (compressed) { 3377 PetscCall(PetscLogGpuTimeBegin()); 3378 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3379 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3380 prevent that. So I just add a ScatterAdd kernel. 3381 */ 3382 #if 0 3383 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3384 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3385 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3386 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3387 VecCUDAPlusEquals()); 3388 #else 3389 PetscInt n = matstruct->cprowIndices->size(); 3390 ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3391 #endif 3392 PetscCall(PetscLogGpuTimeEnd()); 3393 } 3394 } else { 3395 if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ } 3396 } 3397 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3398 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3399 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3400 } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 3401 if (yy) { 3402 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3403 } else { 3404 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3405 } 3406 
PetscFunctionReturn(0); 3407 } 3408 3409 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) { 3410 PetscFunctionBegin; 3411 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3412 PetscFunctionReturn(0); 3413 } 3414 3415 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) { 3416 PetscObjectState onnz = A->nonzerostate; 3417 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 3418 3419 PetscFunctionBegin; 3420 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3421 if (onnz != A->nonzerostate && cusp->deviceMat) { 3422 PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 3423 PetscCallCUDA(cudaFree(cusp->deviceMat)); 3424 cusp->deviceMat = NULL; 3425 } 3426 PetscFunctionReturn(0); 3427 } 3428 3429 /* --------------------------------------------------------------------------------*/ 3430 /*@ 3431 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 3432 (the default parallel PETSc format). This matrix will ultimately pushed down 3433 to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix 3434 assembly performance the user should preallocate the matrix storage by setting 3435 the parameter nz (or the array nnz). By setting these parameters accurately, 3436 performance during matrix assembly can be increased by more than a factor of 50. 3437 3438 Collective 3439 3440 Input Parameters: 3441 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3442 . m - number of rows 3443 . n - number of columns 3444 . nz - number of nonzeros per row (same for all rows) 3445 - nnz - array containing the number of nonzeros in the various rows 3446 (possibly different for each row) or NULL 3447 3448 Output Parameter: 3449 . 
A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}

/* Release the GPU-side storage (plain or triangular-factor variant), detach all composed
   methods, then fall through to the SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode        MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicate on the host via SeqAIJ, then convert the copy in place back to CUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}

/* Y += a*X on the GPU. Uses cublasXaxpy for identical patterns, cusparse spgeam when X's
   pattern is a subset of Y's, and falls back to the host SeqAIJ code otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed backends: do it on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* alpha/beta are host pointers here, so temporarily switch the pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}

/* Y *= a, applied directly to the GPU-resident values with cublasXscal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero the values on both the device (mat and cached transpose, if present) and the host,
   keeping the nonzero pattern. Offload mask reflects where valid (zero) data now lives. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Swap the operation tables and composed methods between the CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations. With flg = PETSC_TRUE the data is first synced back to the host. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) { /* factored matrices only record the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); /* reset the SeqAIJ-level hooks to their defaults */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocate the GPU-side context (plain or
   triangular-factor variant), install the CUSPARSE operation table, and compose the
   type-specific methods. Supports MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX and MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* Type constructor: build a SeqAIJ matrix and convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

   Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

/* Register the cuSPARSE-based solver types (LU/Cholesky/ILU/ICC, plus the banded LU). */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}

static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE
*)mat->spptr; 3808 3809 PetscFunctionBegin; 3810 if (!cusp) PetscFunctionReturn(0); 3811 delete cusp->cooPerm; 3812 delete cusp->cooPerm_a; 3813 cusp->cooPerm = NULL; 3814 cusp->cooPerm_a = NULL; 3815 if (cusp->use_extended_coo) { 3816 PetscCallCUDA(cudaFree(cusp->jmap_d)); 3817 PetscCallCUDA(cudaFree(cusp->perm_d)); 3818 } 3819 cusp->use_extended_coo = PETSC_FALSE; 3820 PetscFunctionReturn(0); 3821 } 3822 3823 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) { 3824 PetscFunctionBegin; 3825 if (*cusparsestruct) { 3826 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 3827 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 3828 delete (*cusparsestruct)->workVector; 3829 delete (*cusparsestruct)->rowoffsets_gpu; 3830 delete (*cusparsestruct)->cooPerm; 3831 delete (*cusparsestruct)->cooPerm_a; 3832 delete (*cusparsestruct)->csr2csc_i; 3833 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3834 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3835 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3836 PetscCall(PetscFree(*cusparsestruct)); 3837 } 3838 PetscFunctionReturn(0); 3839 } 3840 3841 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) { 3842 PetscFunctionBegin; 3843 if (*mat) { 3844 delete (*mat)->values; 3845 delete (*mat)->column_indices; 3846 delete (*mat)->row_offsets; 3847 delete *mat; 3848 *mat = 0; 3849 } 3850 PetscFunctionReturn(0); 3851 } 3852 3853 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) { 3854 PetscFunctionBegin; 3855 if (*trifactor) { 3856 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3857 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 
3858 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3859 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 3860 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3861 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3862 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3863 #endif 3864 PetscCall(PetscFree(*trifactor)); 3865 } 3866 PetscFunctionReturn(0); 3867 } 3868 3869 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) { 3870 CsrMatrix *mat; 3871 3872 PetscFunctionBegin; 3873 if (*matstruct) { 3874 if ((*matstruct)->mat) { 3875 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 3876 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3877 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3878 #else 3879 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3880 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3881 #endif 3882 } else { 3883 mat = (CsrMatrix *)(*matstruct)->mat; 3884 CsrMatrix_Destroy(&mat); 3885 } 3886 } 3887 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3888 delete (*matstruct)->cprowIndices; 3889 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 3890 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 3891 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3892 3893 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3894 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3895 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3896 for (int i = 0; i < 3; i++) { 3897 if (mdata->cuSpMV[i].initialized) { 3898 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3899 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3900 
PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3901 } 3902 } 3903 #endif 3904 delete *matstruct; 3905 *matstruct = NULL; 3906 } 3907 PetscFunctionReturn(0); 3908 } 3909 3910 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) { 3911 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 3912 3913 PetscFunctionBegin; 3914 if (fs) { 3915 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 3916 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 3917 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 3918 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 3919 delete fs->rpermIndices; 3920 delete fs->cpermIndices; 3921 delete fs->workVector; 3922 fs->rpermIndices = NULL; 3923 fs->cpermIndices = NULL; 3924 fs->workVector = NULL; 3925 if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 3926 if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 3927 fs->init_dev_prop = PETSC_FALSE; 3928 #if CUSPARSE_VERSION >= 11500 3929 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 3930 PetscCallCUDA(cudaFree(fs->csrColIdx)); 3931 PetscCallCUDA(cudaFree(fs->csrVal)); 3932 PetscCallCUDA(cudaFree(fs->X)); 3933 PetscCallCUDA(cudaFree(fs->Y)); 3934 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 3935 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 3936 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 3937 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 3938 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 3939 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 3940 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 3941 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 3942 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 3943 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 3944 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 3945 
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}

/* Fully destroy the triangular-factor struct: reset its contents, then free the handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle = (*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy(handle)); /* intentional assignment-in-condition */
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, col) ordering for zipped (i,j) tuples; used to sort COO entries by row then column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (i,j) tuples; used by thrust::unique to collapse repeated COO entries */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when two consecutive indices are equal, 1 otherwise; used with adjacent_difference to flag changes */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
};

/* Logical OR of two change flags (result is 0 or 1) */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device copy of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation recorded: just run a regular assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* NULL v with INSERT_VALUES zeros the matrix; with ADD_VALUES it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /*
        thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the authoritative data now lives on the device */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs         = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose as stale; when destroy is true also free it and the csr2csc index map */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* COO size changed: throw away the previous permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* mirror coo_i[] on the device if the caller passed a host array */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper-bound positions; becomes a->i[1..n] below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i      = [1,1,3,3,4,4]
      d_j      = [2,2,2,3,5,6]
      cooPerm  = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                        ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());                  /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>());       /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays (a->i, a->j, a->a) from the device data computed above */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/*
  COO preallocation dispatcher: use the fast 'Basic' device path when the indices contain no
  negative entries (negative indices mean "ignore this entry" and need the extended CPU-side
  analysis in MatSetPreallocationCOO_SeqAIJ, whose jmap/perm maps are then mirrored on device).
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* only host arrays are scanned for negative indices; device arrays are assumed basic */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* mirror the host-side jmap/perm maps built by MatSetPreallocationCOO_SeqAIJ on the device */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

/*
  Kernel for the extended-COO path: for each of the nnz stored entries, sum the user-provided
  values mapped to it (perm[jmap[i]..jmap[i+1])) and insert/add into the CSR value array a[].
  Launched with a 1D grid; the grid-stride loop makes it valid for any launch configuration.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/* Set/add COO values: extended path runs the kernel above; otherwise defer to the thrust-based Basic path */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    /* NOTE(review): unlike the Basic path, this branch does not special-case v == NULL;
       a NULL host pointer would reach cudaMemcpy below — confirm callers never pass NULL here */
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only: no host->device copy */
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* both pointers are required; partial queries are a no-op */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build (and cache) the uncompressed row offsets on the device from the host a->i */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not
collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only NULLs the caller's pointers; the device data itself remains owned by the matrix */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: make sure the device copy is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state increase or offload-mask change needed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: device copy must be current before caller mutates it */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: device becomes the authoritative copy */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));      /* values may have changed: cached diagonal is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only: no MatSeqAIJCUSPARSECopyToGPU, the existing device values will be overwritten */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the
write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 4536 4537 Not Collective 4538 4539 Input Parameter: 4540 . A - a `MATSEQAIJCUSPARSE` matrix 4541 4542 Output Parameter: 4543 . a - pointer to the device data 4544 4545 Level: developer 4546 4547 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 4548 @*/ 4549 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) { 4550 PetscFunctionBegin; 4551 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4552 PetscValidPointer(a, 2); 4553 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4554 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4555 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4556 *a = NULL; 4557 PetscFunctionReturn(0); 4558 } 4559 4560 struct IJCompare4 { 4561 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) { 4562 if (t1.get<0>() < t2.get<0>()) return true; 4563 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4564 return false; 4565 } 4566 }; 4567 4568 struct Shift { 4569 int _shift; 4570 4571 Shift(int shift) : _shift(shift) { } 4572 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4573 }; 4574 4575 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4576 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) { 4577 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4578 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4579 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4580 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4581 PetscInt Annz, Bnnz; 4582 cusparseStatus_t stat; 4583 PetscInt i, m, n, zero = 0; 4584 4585 PetscFunctionBegin; 4586 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4587 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4588 PetscValidPointer(C, 4); 4589 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4590 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4591 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4592 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4593 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4594 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4595 if (reuse == MAT_INITIAL_MATRIX) { 4596 m = A->rmap->n; 4597 n = A->cmap->n + B->cmap->n; 4598 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4599 PetscCall(MatSetSizes(*C, m, n, m, n)); 4600 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4601 c = (Mat_SeqAIJ *)(*C)->data; 4602 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4603 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4604 Ccsr = new CsrMatrix; 4605 Cmat->cprowIndices = NULL; 4606 c->compressedrow.use = PETSC_FALSE; 4607 c->compressedrow.nrows = 0; 4608 c->compressedrow.i = NULL; 4609 c->compressedrow.rindex = NULL; 4610 Ccusp->workVector = NULL; 4611 Ccusp->nrows = m; 4612 Ccusp->mat = Cmat; 4613 Ccusp->mat->mat = Ccsr; 4614 Ccsr->num_rows = m; 
4615 Ccsr->num_cols = n; 4616 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4617 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4618 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4619 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 4620 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 4621 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4622 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4623 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4624 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4625 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4626 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4627 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4628 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4629 4630 Acsr = (CsrMatrix *)Acusp->mat->mat; 4631 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4632 Annz = (PetscInt)Acsr->column_indices->size(); 4633 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4634 c->nz = Annz + Bnnz; 4635 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4636 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4637 Ccsr->values = new THRUSTARRAY(c->nz); 4638 Ccsr->num_entries = c->nz; 4639 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4640 if (c->nz) { 4641 auto Acoo = new THRUSTINTARRAY32(Annz); 4642 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4643 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4644 THRUSTINTARRAY32 *Aroff, *Broff; 4645 4646 if (a->compressedrow.use) { /* need full row offset */ 4647 if (!Acusp->rowoffsets_gpu) { 4648 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4649 
Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4650 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4651 } 4652 Aroff = Acusp->rowoffsets_gpu; 4653 } else Aroff = Acsr->row_offsets; 4654 if (b->compressedrow.use) { /* need full row offset */ 4655 if (!Bcusp->rowoffsets_gpu) { 4656 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4657 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4658 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4659 } 4660 Broff = Bcusp->rowoffsets_gpu; 4661 } else Broff = Bcsr->row_offsets; 4662 PetscCall(PetscLogGpuTimeBegin()); 4663 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4664 PetscCallCUSPARSE(stat); 4665 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4666 PetscCallCUSPARSE(stat); 4667 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4668 auto Aperm = thrust::make_constant_iterator(1); 4669 auto Bperm = thrust::make_constant_iterator(0); 4670 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4671 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4672 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4673 #else 4674 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4675 auto Bcib = Bcsr->column_indices->begin(); 4676 auto Bcie = Bcsr->column_indices->end(); 4677 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 4678 #endif 4679 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4680 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4681 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4682 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4683 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4684 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4685 auto p1 = Ccusp->cooPerm->begin(); 4686 auto p2 = Ccusp->cooPerm->begin(); 4687 thrust::advance(p2, Annz); 4688 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4689 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 4690 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 4691 #endif 4692 auto cci = thrust::make_counting_iterator(zero); 4693 auto cce = thrust::make_counting_iterator(c->nz); 4694 #if 0 //Errors on SUMMIT cuda 11.1.0 4695 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4696 #else 4697 auto pred = thrust::identity<int>(); 4698 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4699 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 4700 #endif 4701 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4702 PetscCallCUSPARSE(stat); 4703 PetscCall(PetscLogGpuTimeEnd()); 4704 delete wPerm; 4705 delete Acoo; 4706 delete Bcoo; 4707 delete Ccoo; 4708 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4709 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4710 PetscCallCUSPARSE(stat); 4711 #endif 4712 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4713 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4714 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4715 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4716 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4717 CsrMatrix *CcsrT = new CsrMatrix; 4718 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4719 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4720 4721 (*C)->form_explicit_transpose = PETSC_TRUE; 4722 (*C)->transupdated = PETSC_TRUE; 4723 Ccusp->rowoffsets_gpu = NULL; 4724 CmatT->cprowIndices = NULL; 4725 CmatT->mat = CcsrT; 4726 CcsrT->num_rows = n; 4727 CcsrT->num_cols = m; 4728 CcsrT->num_entries = c->nz; 4729 4730 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4731 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4732 CcsrT->values = new THRUSTARRAY(c->nz); 4733 4734 PetscCall(PetscLogGpuTimeBegin()); 4735 auto rT = CcsrT->row_offsets->begin(); 4736 if (AT) { 4737 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4738 thrust::advance(rT, -1); 4739 } 4740 if (BT) { 4741 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4742 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4743 thrust::copy(titb, tite, rT); 4744 } 4745 auto cT = CcsrT->column_indices->begin(); 4746 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4747 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4748 auto vT = CcsrT->values->begin(); 4749 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4750 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4751 PetscCall(PetscLogGpuTimeEnd()); 4752 4753 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4754 
PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 4755 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4756 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 4757 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 4758 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4759 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4760 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4761 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4762 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4763 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4764 PetscCallCUSPARSE(stat); 4765 #endif 4766 Ccusp->matTranspose = CmatT; 4767 } 4768 } 4769 4770 c->singlemalloc = PETSC_FALSE; 4771 c->free_a = PETSC_TRUE; 4772 c->free_ij = PETSC_TRUE; 4773 PetscCall(PetscMalloc1(m + 1, &c->i)); 4774 PetscCall(PetscMalloc1(c->nz, &c->j)); 4775 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4776 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4777 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4778 ii = *Ccsr->row_offsets; 4779 jj = *Ccsr->column_indices; 4780 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4781 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4782 } else { 4783 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), 
Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4784 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4785 } 4786 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 4787 PetscCall(PetscMalloc1(m, &c->ilen)); 4788 PetscCall(PetscMalloc1(m, &c->imax)); 4789 c->maxnz = c->nz; 4790 c->nonzerorowcnt = 0; 4791 c->rmax = 0; 4792 for (i = 0; i < m; i++) { 4793 const PetscInt nn = c->i[i + 1] - c->i[i]; 4794 c->ilen[i] = c->imax[i] = nn; 4795 c->nonzerorowcnt += (PetscInt) !!nn; 4796 c->rmax = PetscMax(c->rmax, nn); 4797 } 4798 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4799 PetscCall(PetscMalloc1(c->nz, &c->a)); 4800 (*C)->nonzerostate++; 4801 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4802 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4803 Ccusp->nonzerostate = (*C)->nonzerostate; 4804 (*C)->preallocated = PETSC_TRUE; 4805 } else { 4806 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4807 c = (Mat_SeqAIJ *)(*C)->data; 4808 if (c->nz) { 4809 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4810 PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 4811 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4812 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 4813 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4814 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4815 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4816 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4817 Acsr = (CsrMatrix *)Acusp->mat->mat; 4818 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4819 Ccsr 
= (CsrMatrix *)Ccusp->mat->mat; 4820 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4821 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4822 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4823 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4824 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4825 auto pmid = Ccusp->cooPerm->begin(); 4826 thrust::advance(pmid, Acsr->num_entries); 4827 PetscCall(PetscLogGpuTimeBegin()); 4828 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 4829 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4830 thrust::for_each(zibait, zieait, VecCUDAEquals()); 4831 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4832 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4833 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 4834 
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4835 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4836 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4837 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4838 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4839 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4840 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4841 auto vT = CcsrT->values->begin(); 4842 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4843 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4844 (*C)->transupdated = PETSC_TRUE; 4845 } 4846 PetscCall(PetscLogGpuTimeEnd()); 4847 } 4848 } 4849 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4850 (*C)->assembled = PETSC_TRUE; 4851 (*C)->was_assembled = PETSC_FALSE; 4852 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4853 PetscFunctionReturn(0); 4854 } 4855 4856 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) { 4857 bool dmem; 4858 const PetscScalar *av; 4859 4860 PetscFunctionBegin; 4861 dmem = isCudaMem(v); 4862 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 4863 if (n && idx) { 4864 THRUSTINTARRAY widx(n); 4865 widx.assign(idx, idx + n); 4866 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 4867 4868 THRUSTARRAY *w = NULL; 4869 thrust::device_ptr<PetscScalar> dv; 4870 if (dmem) { 4871 dv = thrust::device_pointer_cast(v); 4872 } else { 4873 w = new THRUSTARRAY(n); 4874 dv = w->data(); 4875 } 4876 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4877 4878 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 4879 auto zieit = 
thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 4880 thrust::for_each(zibit, zieit, VecCUDAEquals()); 4881 if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 4882 delete w; 4883 } else { 4884 PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4885 } 4886 if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 4887 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 4888 PetscFunctionReturn(0); 4889 } 4890