/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Enum-string table for MatCUSPARSEStorageFormat; the trailing entries (type name, option prefix, null)
   follow the PETSc convention required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* Forward declarations for the factorization/solve/mat-vec implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **); 87 88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 90 91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 94 95 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) { 96 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 97 98 PetscFunctionBegin; 99 switch (op) { 100 case MAT_CUSPARSE_MULT: cusparsestruct->format = format; break; 101 case MAT_CUSPARSE_ALL: cusparsestruct->format = format; break; 102 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 103 } 104 PetscFunctionReturn(0); 105 } 106 107 /*@ 108 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 109 operation. Only the MatMult operation can use different GPU storage formats 110 for MPIAIJCUSPARSE matrices. 111 Not Collective 112 113 Input Parameters: 114 + A - Matrix of type SEQAIJCUSPARSE 115 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 116 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 117 118 Output Parameter: 119 120 Level: intermediate 121 122 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 123 @*/ 124 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) { 125 PetscFunctionBegin; 126 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 127 PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 128 PetscFunctionReturn(0); 129 } 130 131 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) { 132 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 133 134 PetscFunctionBegin; 135 cusparsestruct->use_cpu_solve = use_cpu; 136 PetscFunctionReturn(0); 137 } 138 139 /*@ 140 MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 141 142 Input Parameters: 143 + A - Matrix of type SEQAIJCUSPARSE 144 - use_cpu - set flag for using the built-in CPU MatSolve 145 146 Output Parameter: 147 148 Notes: 149 The cuSparse LU solver currently computes the factors with the built-in CPU method 150 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 151 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
152 153 Level: intermediate 154 155 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 156 @*/ 157 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) { 158 PetscFunctionBegin; 159 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 160 PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 161 PetscFunctionReturn(0); 162 } 163 164 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) { 165 PetscFunctionBegin; 166 switch (op) { 167 case MAT_FORM_EXPLICIT_TRANSPOSE: 168 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 169 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 170 A->form_explicit_transpose = flg; 171 break; 172 default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break; 173 } 174 PetscFunctionReturn(0); 175 } 176 177 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 178 179 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) { 180 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 181 IS isrow = b->row, iscol = b->col; 182 PetscBool row_identity, col_identity; 183 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr; 184 185 PetscFunctionBegin; 186 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 187 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 188 B->offloadmask = PETSC_OFFLOAD_CPU; 189 /* determine which version of MatSolve needs to be used. 
*/ 190 PetscCall(ISIdentity(isrow, &row_identity)); 191 PetscCall(ISIdentity(iscol, &col_identity)); 192 193 if (!cusparsestruct->use_cpu_solve) { 194 if (row_identity && col_identity) { 195 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 196 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 197 } else { 198 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 199 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 200 } 201 } 202 B->ops->matsolve = NULL; 203 B->ops->matsolvetranspose = NULL; 204 205 /* get the triangular factors */ 206 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 207 PetscFunctionReturn(0); 208 } 209 210 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) { 211 MatCUSPARSEStorageFormat format; 212 PetscBool flg; 213 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 214 215 PetscFunctionBegin; 216 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 217 if (A->factortype == MAT_FACTOR_NONE) { 218 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 219 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 220 221 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 222 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 223 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 224 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, 
cusparsestruct->use_cpu_solve)); 225 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 226 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 227 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 228 #if CUSPARSE_VERSION > 11301 229 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 230 #else 231 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 232 #endif 233 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 234 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 235 236 PetscCall( 237 PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 238 PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 239 #endif 240 } 241 PetscOptionsHeadEnd(); 242 PetscFunctionReturn(0); 243 } 244 245 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 
{ 246 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 247 PetscInt n = A->rmap->n; 248 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 249 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 250 const PetscInt *ai = a->i, *aj = a->j, *vi; 251 const MatScalar *aa = a->a, *v; 252 PetscInt *AiLo, *AjLo; 253 PetscInt i, nz, nzLower, offset, rowOffset; 254 255 PetscFunctionBegin; 256 if (!n) PetscFunctionReturn(0); 257 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 258 try { 259 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 260 nzLower = n + ai[n] - ai[1]; 261 if (!loTriFactor) { 262 PetscScalar *AALo; 263 264 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar))); 265 266 /* Allocate Space for the lower triangular matrix */ 267 PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 268 PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt))); 269 270 /* Fill the lower triangular matrix */ 271 AiLo[0] = (PetscInt)0; 272 AiLo[n] = nzLower; 273 AjLo[0] = (PetscInt)0; 274 AALo[0] = (MatScalar)1.0; 275 v = aa; 276 vi = aj; 277 offset = 1; 278 rowOffset = 1; 279 for (i = 1; i < n; i++) { 280 nz = ai[i + 1] - ai[i]; 281 /* additional 1 for the term on the diagonal */ 282 AiLo[i] = rowOffset; 283 rowOffset += nz + 1; 284 285 PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 286 PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 287 288 offset += nz; 289 AjLo[offset] = (PetscInt)i; 290 AALo[offset] = (MatScalar)1.0; 291 offset += 1; 292 293 v += nz; 294 vi += nz; 295 } 296 297 /* allocate space for the triangular factor information */ 298 PetscCall(PetscNew(&loTriFactor)); 299 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 300 /* Create the matrix description */ 301 
PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 302 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 303 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 304 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 305 #else 306 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 307 #endif 308 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 309 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 310 311 /* set the operation */ 312 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 313 314 /* set the matrix */ 315 loTriFactor->csrMat = new CsrMatrix; 316 loTriFactor->csrMat->num_rows = n; 317 loTriFactor->csrMat->num_cols = n; 318 loTriFactor->csrMat->num_entries = nzLower; 319 320 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 321 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 322 323 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 324 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 325 326 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 327 loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 328 329 /* Create the solve analysis information */ 330 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 331 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 332 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 333 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 334 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 335 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, 
loTriFactor->solveBufferSize)); 336 #endif 337 338 /* perform the solve analysis */ 339 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 340 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), 341 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 342 loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 343 #else 344 loTriFactor->solveInfo)); 345 #endif 346 PetscCallCUDA(WaitForCUDA()); 347 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 348 349 /* assign the pointer */ 350 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 351 loTriFactor->AA_h = AALo; 352 PetscCallCUDA(cudaFreeHost(AiLo)); 353 PetscCallCUDA(cudaFreeHost(AjLo)); 354 PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 355 } else { /* update values only */ 356 if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 357 /* Fill the lower triangular matrix */ 358 loTriFactor->AA_h[0] = 1.0; 359 v = aa; 360 vi = aj; 361 offset = 1; 362 for (i = 1; i < n; i++) { 363 nz = ai[i + 1] - ai[i]; 364 PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 365 offset += nz; 366 loTriFactor->AA_h[offset] = 1.0; 367 offset += 1; 368 v += nz; 369 } 370 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower); 371 PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar))); 372 } 373 } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 374 } 375 PetscFunctionReturn(0); 376 } 377 378 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) { 379 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 380 PetscInt n = A->rmap->n; 381 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 382 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 383 const PetscInt *aj = a->j, *adiag = a->diag, *vi; 384 const MatScalar *aa = a->a, *v; 385 PetscInt *AiUp, *AjUp; 386 PetscInt i, nz, nzUpper, offset; 387 388 PetscFunctionBegin; 389 if (!n) PetscFunctionReturn(0); 390 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 391 try { 392 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 393 nzUpper = adiag[0] - adiag[n]; 394 if (!upTriFactor) { 395 PetscScalar *AAUp; 396 397 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 398 399 /* Allocate Space for the upper triangular matrix */ 400 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 401 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 402 403 /* Fill the upper triangular matrix */ 404 AiUp[0] = (PetscInt)0; 405 AiUp[n] = nzUpper; 406 offset = nzUpper; 407 for (i = n - 1; i >= 0; i--) { 408 v = aa + adiag[i + 1] + 1; 409 vi = aj + adiag[i + 1] + 1; 410 411 /* number of elements NOT on the diagonal */ 412 nz = adiag[i] - adiag[i + 1] - 1; 413 414 /* decrement the offset */ 415 offset -= (nz + 1); 416 417 /* first, set the diagonal elements */ 418 AjUp[offset] = (PetscInt)i; 419 AAUp[offset] = (MatScalar)1. 
/ v[nz]; 420 AiUp[i] = AiUp[i + 1] - (nz + 1); 421 422 PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz)); 423 PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz)); 424 } 425 426 /* allocate space for the triangular factor information */ 427 PetscCall(PetscNew(&upTriFactor)); 428 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 429 430 /* Create the matrix description */ 431 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 432 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 433 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 434 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 435 #else 436 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 437 #endif 438 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 439 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 440 441 /* set the operation */ 442 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 443 444 /* set the matrix */ 445 upTriFactor->csrMat = new CsrMatrix; 446 upTriFactor->csrMat->num_rows = n; 447 upTriFactor->csrMat->num_cols = n; 448 upTriFactor->csrMat->num_entries = nzUpper; 449 450 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 451 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 452 453 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 454 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 455 456 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 457 upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 458 459 /* Create the solve analysis information */ 460 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 461 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 462 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 463 
PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 464 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 465 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 466 #endif 467 468 /* perform the solve analysis */ 469 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 470 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), 471 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 472 upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 473 #else 474 upTriFactor->solveInfo)); 475 #endif 476 PetscCallCUDA(WaitForCUDA()); 477 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 478 479 /* assign the pointer */ 480 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 481 upTriFactor->AA_h = AAUp; 482 PetscCallCUDA(cudaFreeHost(AiUp)); 483 PetscCallCUDA(cudaFreeHost(AjUp)); 484 PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 485 } else { 486 if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 487 /* Fill the upper triangular matrix */ 488 offset = nzUpper; 489 for (i = n - 1; i >= 0; i--) { 490 v = aa + adiag[i + 1] + 1; 491 492 /* number of elements NOT on the diagonal */ 493 nz = adiag[i] - adiag[i + 1] - 1; 494 495 /* decrement the offset */ 496 offset -= (nz + 1); 497 498 /* first, set the diagonal elements */ 499 upTriFactor->AA_h[offset] = 1. 
/ v[nz]; 500 PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz)); 501 } 502 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper); 503 PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar))); 504 } 505 } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 506 } 507 PetscFunctionReturn(0); 508 } 509 510 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) { 511 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 512 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 513 IS isrow = a->row, iscol = a->icol; 514 PetscBool row_identity, col_identity; 515 PetscInt n = A->rmap->n; 516 517 PetscFunctionBegin; 518 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 519 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 520 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 521 522 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 523 cusparseTriFactors->nnz = a->nz; 524 525 A->offloadmask = PETSC_OFFLOAD_BOTH; 526 /* lower triangular indices */ 527 PetscCall(ISIdentity(isrow, &row_identity)); 528 if (!row_identity && !cusparseTriFactors->rpermIndices) { 529 const PetscInt *r; 530 531 PetscCall(ISGetIndices(isrow, &r)); 532 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 533 cusparseTriFactors->rpermIndices->assign(r, r + n); 534 PetscCall(ISRestoreIndices(isrow, &r)); 535 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 536 } 537 538 /* upper triangular indices */ 539 PetscCall(ISIdentity(iscol, &col_identity)); 540 if (!col_identity && !cusparseTriFactors->cpermIndices) { 541 const PetscInt *c; 542 543 PetscCall(ISGetIndices(iscol, &c)); 544 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 545 cusparseTriFactors->cpermIndices->assign(c, c + n); 546 PetscCall(ISRestoreIndices(iscol, &c)); 547 PetscCall(PetscLogCpuToGpu(n * 
sizeof(PetscInt))); 548 } 549 PetscFunctionReturn(0); 550 } 551 552 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) { 553 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 554 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 555 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 556 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 557 PetscInt *AiUp, *AjUp; 558 PetscScalar *AAUp; 559 PetscScalar *AALo; 560 PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 561 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 562 const PetscInt *ai = b->i, *aj = b->j, *vj; 563 const MatScalar *aa = b->a, *v; 564 565 PetscFunctionBegin; 566 if (!n) PetscFunctionReturn(0); 567 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 568 try { 569 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 570 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 571 if (!upTriFactor && !loTriFactor) { 572 /* Allocate Space for the upper triangular matrix */ 573 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 574 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 575 576 /* Fill the upper triangular matrix */ 577 AiUp[0] = (PetscInt)0; 578 AiUp[n] = nzUpper; 579 offset = 0; 580 for (i = 0; i < n; i++) { 581 /* set the pointers */ 582 v = aa + ai[i]; 583 vj = aj + ai[i]; 584 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 585 586 /* first, set the diagonal elements */ 587 AjUp[offset] = (PetscInt)i; 588 AAUp[offset] = (MatScalar)1.0 / v[nz]; 589 AiUp[i] = offset; 590 AALo[offset] = (MatScalar)1.0 / v[nz]; 591 592 offset += 1; 593 if (nz > 0) { 594 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 595 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 596 for (j = 
offset; j < offset + nz; j++) { 597 AAUp[j] = -AAUp[j]; 598 AALo[j] = AAUp[j] / v[nz]; 599 } 600 offset += nz; 601 } 602 } 603 604 /* allocate space for the triangular factor information */ 605 PetscCall(PetscNew(&upTriFactor)); 606 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 607 608 /* Create the matrix description */ 609 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 610 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 611 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 612 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 613 #else 614 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 615 #endif 616 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 617 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 618 619 /* set the matrix */ 620 upTriFactor->csrMat = new CsrMatrix; 621 upTriFactor->csrMat->num_rows = A->rmap->n; 622 upTriFactor->csrMat->num_cols = A->cmap->n; 623 upTriFactor->csrMat->num_entries = a->nz; 624 625 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 626 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 627 628 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 629 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 630 631 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 632 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 633 634 /* set the operation */ 635 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 636 637 /* Create the solve analysis information */ 638 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 639 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 640 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 641 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, 
upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 642 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 643 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 644 #endif 645 646 /* perform the solve analysis */ 647 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 648 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), 649 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 650 upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 651 #else 652 upTriFactor->solveInfo)); 653 #endif 654 PetscCallCUDA(WaitForCUDA()); 655 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 656 657 /* assign the pointer */ 658 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 659 660 /* allocate space for the triangular factor information */ 661 PetscCall(PetscNew(&loTriFactor)); 662 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 663 664 /* Create the matrix description */ 665 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 666 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 667 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 668 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 669 #else 670 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 671 #endif 672 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 673 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 674 675 /* 
set the operation */ 676 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 677 678 /* set the matrix */ 679 loTriFactor->csrMat = new CsrMatrix; 680 loTriFactor->csrMat->num_rows = A->rmap->n; 681 loTriFactor->csrMat->num_cols = A->cmap->n; 682 loTriFactor->csrMat->num_entries = a->nz; 683 684 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 685 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 686 687 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 688 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 689 690 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 691 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 692 693 /* Create the solve analysis information */ 694 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 695 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 696 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 697 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 698 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 699 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 700 #endif 701 702 /* perform the solve analysis */ 703 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 704 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), 705 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 706 loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 707 #else 708 loTriFactor->solveInfo)); 709 #endif 710 
PetscCallCUDA(WaitForCUDA());
PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

/* assign the pointer */
((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
PetscCallCUDA(cudaFreeHost(AiUp));
PetscCallCUDA(cudaFreeHost(AjUp));
} else {
  /* Factor structures already live on the GPU: only the numerical values need refreshing.
     Fill the upper triangular matrix */
  offset = 0;
  for (i = 0; i < n; i++) {
    /* set the pointers */
    v  = aa + ai[i];
    nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

    /* first, set the diagonal elements (stored inverted so the solve can multiply) */
    AAUp[offset] = 1.0 / v[nz];
    AALo[offset] = 1.0 / v[nz];

    offset += 1;
    if (nz > 0) {
      PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
      for (j = offset; j < offset + nz; j++) {
        AAUp[j] = -AAUp[j];
        AALo[j] = AAUp[j] / v[nz]; /* lower factor is the upper one scaled by the inverted diagonal */
      }
      offset += nz;
    }
  }
  PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* upload only the new values; sparsity pattern on the GPU is reused */
  upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
  loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
  PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
}
PetscCallCUDA(cudaFreeHost(AAUp));
PetscCallCUDA(cudaFreeHost(AALo));
} catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
}
PetscFunctionReturn(0);
}

/* Build (or refresh) the ICC triangular factors on the GPU, size the shared work vector,
   and, for non-identity orderings, upload the row/column permutation index arrays used by
   MatSolve/MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  /* temporary vector shared by the triangular solves */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* nonzeros of both factors: off-diagonals counted twice plus one diagonal per row */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    /* row permutation applied to b, inverse permutation applied to x */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: factor on the CPU, then push the factors to the GPU
   and select the solve routines according to the ordering. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
*/
PetscCall(ISIdentity(ip, &perm_identity));
if (perm_identity) {
  /* natural ordering: permutation-free solve paths */
  B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
} else {
  B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
}

/* get the triangular factors */
PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
PetscFunctionReturn(0);
}

/* Build CSC copies (i.e. explicit transposes) of both triangular factors with csr2csc and run
   the csrsv solve analysis on them, so that MatSolveTranspose can use NON_TRANSPOSE solves.
   The results are stashed in loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposing flips the fill mode, the rest is inherited */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this must close the event opened above, not begin it a second time */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  /* bug fix: close the event opened above (was a duplicated PetscLogEventBegin) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Functor: convert a PetscScalar-encoded index (real part) back to a PetscInt;
   used with thrust::transform to recover the csr2csc permutation */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/* Form (or refresh) the explicit transpose of the CSR matrix on the GPU and cache it in
   cusparsestruct->matTranspose; sets A->transupdated on success. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* cached transpose is still valid: nothing to do */
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-side scalars used by the SpMV/SpMM calls) */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* transpose has swapped row/column dimensions, same nonzero count */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* no direct HYB transpose: go HYB -> CSR -> CSC -> HYB via temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* run csr2csc once on 0,1,2,... to recover the CSR->CSC value permutation,
         so later updates can be done with a cheap thrust gather instead of csr2csc */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the (possibly updated) values into the transpose via the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* NOTE(review): Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* first transpose-solve: build and analyze the transposed factors, then re-fetch them */
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve U (x -> tempGPU) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L (tempGPU -> x) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Transpose solve for the natural (identity) ordering: no permutation copies are needed,
   the factors are applied directly to the vector arrays. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U (b -> tempGPU) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L (tempGPU -> x) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1305 PetscCall(PetscLogGpuTimeEnd()); 1306 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1307 PetscFunctionReturn(0); 1308 } 1309 1310 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) { 1311 const PetscScalar *barray; 1312 PetscScalar *xarray; 1313 thrust::device_ptr<const PetscScalar> bGPU; 1314 thrust::device_ptr<PetscScalar> xGPU; 1315 cusparseStatus_t stat; 1316 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1317 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1318 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1319 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1320 1321 PetscFunctionBegin; 1322 1323 /* Get the GPU pointers */ 1324 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1325 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1326 xGPU = thrust::device_pointer_cast(xarray); 1327 bGPU = thrust::device_pointer_cast(barray); 1328 1329 PetscCall(PetscLogGpuTimeBegin()); 1330 /* First, reorder with the row permutation */ 1331 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1332 1333 /* Next, solve L */ 1334 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 1335 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1336 loTriFactor->csrMat->num_entries, 1337 #endif 1338 &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 1339 tempGPU->data().get(), 1340 #if 
PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1341 xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer); 1342 PetscCallCUSPARSE(stat); 1343 #else 1344 xarray); 1345 PetscCallCUSPARSE(stat); 1346 #endif 1347 1348 /* Then, solve U */ 1349 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 1350 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1351 upTriFactor->csrMat->num_entries, 1352 #endif 1353 &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, 1354 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1355 tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer); 1356 PetscCallCUSPARSE(stat); 1357 #else 1358 tempGPU->data().get()); 1359 PetscCallCUSPARSE(stat); 1360 #endif 1361 1362 /* Last, reorder with the column permutation */ 1363 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1364 1365 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1366 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1367 PetscCall(PetscLogGpuTimeEnd()); 1368 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1369 PetscFunctionReturn(0); 1370 } 1371 1372 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) { 1373 const PetscScalar *barray; 1374 PetscScalar *xarray; 1375 cusparseStatus_t stat; 1376 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1377 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1378 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct 
*)cusparseTriFactors->upTriFactorPtr; 1379 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1380 1381 PetscFunctionBegin; 1382 /* Get the GPU pointers */ 1383 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1384 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1385 1386 PetscCall(PetscLogGpuTimeBegin()); 1387 /* First, solve L */ 1388 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 1389 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1390 loTriFactor->csrMat->num_entries, 1391 #endif 1392 &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, 1393 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1394 tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer); 1395 PetscCallCUSPARSE(stat); 1396 #else 1397 tempGPU->data().get()); 1398 PetscCallCUSPARSE(stat); 1399 #endif 1400 1401 /* Next, solve U */ 1402 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 1403 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1404 upTriFactor->csrMat->num_entries, 1405 #endif 1406 &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 1407 tempGPU->data().get(), 1408 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1409 xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer); 1410 PetscCallCUSPARSE(stat); 1411 #else 1412 xarray); 1413 PetscCallCUSPARSE(stat); 1414 #endif 1415 1416 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1417 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1418 PetscCall(PetscLogGpuTimeEnd()); 1419 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1420 PetscFunctionReturn(0); 1421 } 1422 1423 #if CUSPARSE_VERSION >= 
11500 1424 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1425 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) { 1426 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1427 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1428 const PetscScalar *barray; 1429 PetscScalar *xarray; 1430 1431 PetscFunctionBegin; 1432 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1433 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1434 PetscCall(PetscLogGpuTimeBegin()); 1435 1436 /* Solve L*y = b */ 1437 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1438 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1439 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1440 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, 1441 fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()! 

  /* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Transpose solve with the ILU(0) factors: A^T = (LU)^T = U^T L^T, so solve
   U^T y = b then L^T x = y. The transpose SpSV descriptors/buffers are created
   lazily on first use and the (numeric) analysis is redone whenever the factor
   values change (flag reset by MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric ILU(0) factorization on the GPU: copies A's values into fact's value
   array, runs cusparseXcsrilu02() in place, then performs the numeric SpSV
   analysis for the L and U solves and installs the SpSV-based solve callbacks. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ILU(0) setup: since ILU(0) has no fill, fact reuses A's sparsity
   pattern directly. Allocates device copies of i/j/values, creates the cuSPARSE
   descriptors for M (the combined in-place factor), L (unit lower) and U
   (non-unit upper), sizes all work buffers, runs the structural analysis, and
   installs the numeric-factorization callback.
   NOTE(review): isrow/iscol are not used here — natural ordering appears to be
   assumed; confirm against the factory that registers this function. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           NOTE(review): the exact count assigned above is immediately overwritten by the (nzRow - 1) / 2
           heuristic below, making the first assignment a dead store — confirm which estimate is intended.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Forward solve with the in-place IC(0) factor: L y = b into the internal work
   vector Y, then L^T x = y into x (Cholesky-style, A = L L^T). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric IC(0) factorization on the GPU: copies A's values into fact, runs
   cusparseXcsric02() in place on the lower triangle, then performs the numeric
   SpSV analysis for both L and L^T solves and installs the solve callbacks.
   Because the factorization is symmetric, solvetranspose reuses the same solve. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic IC(0) factorization on the GPU.

   Since IC(0) uses exactly A's lower-triangular sparsity pattern, the "symbolic" phase here
   copies A's CSR structure to fact, creates the cuSPARSE matrix/vector descriptors, allocates the
   work buffers shared between csric02 and the SpSV solves, and runs the csric02 analysis.
   The numeric phase (MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0, installed at the end) only refreshes values.

   Input Parameters:
.  fact - factor matrix to set up
.  A    - matrix to be factored (must be MATSEQAIJCUSPARSE, square, with a full diagonal)
.  perm - row/column permutation (must be identity for this fast path; checked by the caller)
.  info - factorization options (only info->fill is recorded)
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* M is the legacy descriptor used by csric02; L is the generic SpMat     */
  /* descriptor (same storage) used by the SpSV solves                      */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* X, Y are the work vectors bound to the dense-vector descriptors used by SpSV */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
     The larger of the two SpSV buffers is the one merged with the factorization buffer.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           nzLeft = (nzRow - 1)/2 estimates the nonzeros strictly left of the diagonal, assuming a
           symmetric pattern — NOTE(review): heuristic estimate only.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
#endif

/* Symbolic ILU factorization dispatcher: uses the device ILU(0) fast path when possible
   (no fill levels, identity permutations, device factorization enabled), otherwise falls back
   to the host symbolic factorization with a CUSPARSE numeric phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) { /* device ILU(0) fast path */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }

PetscFunctionReturn(0); 1955 } 1956 1957 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1958 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1959 1960 PetscFunctionBegin; 1961 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1962 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1963 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1964 PetscFunctionReturn(0); 1965 } 1966 1967 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) { 1968 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1969 1970 PetscFunctionBegin; 1971 #if CUSPARSE_VERSION >= 11500 1972 PetscBool perm_identity = PETSC_FALSE; 1973 if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 1974 if (!info->levels && perm_identity) { 1975 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 1976 } else 1977 #endif 1978 { 1979 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1980 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 1981 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 1982 } 1983 PetscFunctionReturn(0); 1984 } 1985 1986 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) { 1987 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1988 1989 PetscFunctionBegin; 1990 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1991 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 1992 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 1993 PetscFunctionReturn(0); 1994 } 1995 1996 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) { 1997 PetscFunctionBegin; 1998 *type = 
MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Create a factor matrix B of the cusparse solver type for A.

   Processes the -mat_factor_bind_factorization option ("host" or "device", default "device")
   to decide where the factorization is performed, and installs the symbolic-factorization
   function pointers appropriate for the requested factor type (LU/ILU/ILUDT or Cholesky/ICC).
   When A is bound to the CPU, the plain SeqAIJ symbolic routines are installed instead. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Prefer the factor's own options prefix; fall back to A's */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Copy the matrix values (not the pattern) from the GPU back to the host arrays,
   for both unfactored matrices and device-factored matrices (when fs->csrVal exists).
   After the copy the offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  /* Read-write host access: sync values from GPU first, then hand out the host array */
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  /* Values may have been modified on the host; the GPU copy is now stale */
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  /* Read-only host access: sync from GPU, but the restore below will not invalidate the GPU copy */
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  /* Write-only host access: current GPU values need not be copied back since they will be overwritten */
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Return device pointers to the CSR arrays of A (32-bit row offsets/column indices), along with
   the memory type (CUDA). Only valid for unfactored matrices and 32-bit PetscInt builds. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* The device arrays are THRUSTINTARRAY32 (int); they cannot alias 64-bit PetscInt */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Mirror the host CSR data of A onto the GPU.

   If the nonzero pattern is unchanged since the last copy (same nonzerostate) and the format is CSR,
   only the values are uploaded and the cached transpose is invalidated (values only).
   Otherwise the whole device structure is rebuilt: the old mult struct, work vector and row offsets
   are destroyed, the (possibly compressed-row) CSR arrays are uploaded, and a CSR or (pre-CUDA-11)
   ELL/HYB device matrix is created. On success offloadmask becomes PETSC_OFFLOAD_BOTH unless the
   host had no value array (then the device copy is the only valid one for values). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* Compressed row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* No host values yet: upload structure only; offload mask will stay GPU-only */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants, since the handle uses CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Build a temporary CSR on device, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: y += x over zipped iterators */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: y = x over zipped iterators */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: x = y (assignment in the reverse direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
template <typename Tuple> 2329 __host__ __device__ void operator()(Tuple t) { 2330 thrust::get<0>(t) = thrust::get<1>(t); 2331 } 2332 }; 2333 2334 struct MatMatCusparse { 2335 PetscBool cisdense; 2336 PetscScalar *Bt; 2337 Mat X; 2338 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2339 PetscLogDouble flops; 2340 CsrMatrix *Bcsr; 2341 2342 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2343 cusparseSpMatDescr_t matSpBDescr; 2344 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2345 cusparseDnMatDescr_t matBDescr; 2346 cusparseDnMatDescr_t matCDescr; 2347 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2348 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2349 void *dBuffer4; 2350 void *dBuffer5; 2351 #endif 2352 size_t mmBufferSize; 2353 void *mmBuffer; 2354 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2355 cusparseSpGEMMDescr_t spgemmDesc; 2356 #endif 2357 }; 2358 2359 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) { 2360 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2361 2362 PetscFunctionBegin; 2363 PetscCallCUDA(cudaFree(mmdata->Bt)); 2364 delete mmdata->Bcsr; 2365 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2366 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2367 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2368 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2369 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2370 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2371 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2372 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2373 #endif 2374 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2375 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2376 #endif 2377 
PetscCall(MatDestroy(&mmdata->X)); 2378 PetscCall(PetscFree(data)); 2379 PetscFunctionReturn(0); 2380 } 2381 2382 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool); 2383 2384 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) { 2385 Mat_Product *product = C->product; 2386 Mat A, B; 2387 PetscInt m, n, blda, clda; 2388 PetscBool flg, biscuda; 2389 Mat_SeqAIJCUSPARSE *cusp; 2390 cusparseStatus_t stat; 2391 cusparseOperation_t opA; 2392 const PetscScalar *barray; 2393 PetscScalar *carray; 2394 MatMatCusparse *mmdata; 2395 Mat_SeqAIJCUSPARSEMultStruct *mat; 2396 CsrMatrix *csrmat; 2397 2398 PetscFunctionBegin; 2399 MatCheckProduct(C, 1); 2400 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2401 mmdata = (MatMatCusparse *)product->data; 2402 A = product->A; 2403 B = product->B; 2404 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2405 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2406 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2407 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2408 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2409 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2410 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2411 switch (product->type) { 2412 case MATPRODUCT_AB: 2413 case MATPRODUCT_PtAP: 2414 mat = cusp->mat; 2415 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2416 m = A->rmap->n; 2417 n = B->cmap->n; 2418 break; 2419 case MATPRODUCT_AtB: 2420 if (!A->form_explicit_transpose) { 2421 mat = cusp->mat; 2422 opA = CUSPARSE_OPERATION_TRANSPOSE; 2423 } else { 2424 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2425 mat = cusp->matTranspose; 2426 
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2427 } 2428 m = A->cmap->n; 2429 n = B->cmap->n; 2430 break; 2431 case MATPRODUCT_ABt: 2432 case MATPRODUCT_RARt: 2433 mat = cusp->mat; 2434 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2435 m = A->rmap->n; 2436 n = B->rmap->n; 2437 break; 2438 default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2439 } 2440 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2441 csrmat = (CsrMatrix *)mat->mat; 2442 /* if the user passed a CPU matrix, copy the data to the GPU */ 2443 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2444 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2445 PetscCall(MatDenseCUDAGetArrayRead(B, &barray)); 2446 2447 PetscCall(MatDenseGetLDA(B, &blda)); 2448 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2449 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray)); 2450 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2451 } else { 2452 PetscCall(MatDenseCUDAGetArrayWrite(C, &carray)); 2453 PetscCall(MatDenseGetLDA(C, &clda)); 2454 } 2455 2456 PetscCall(PetscLogGpuTimeBegin()); 2457 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2458 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2459 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2460 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2461 size_t mmBufferSize; 2462 if (mmdata->initialized && mmdata->Blda != blda) { 2463 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2464 mmdata->matBDescr = NULL; 2465 } 2466 if (!mmdata->matBDescr) { 2467 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2468 mmdata->Blda = blda; 2469 } 2470 2471 if (mmdata->initialized && mmdata->Clda != clda) { 2472 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2473 mmdata->matCDescr = NULL; 2474 } 2475 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2476 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2477 mmdata->Clda = clda; 2478 } 2479 2480 if (!mat->matDescr) { 2481 stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2482 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2483 PetscCallCUSPARSE(stat); 2484 } 2485 stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 2486 PetscCallCUSPARSE(stat); 2487 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2488 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2489 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2490 mmdata->mmBufferSize = mmBufferSize; 2491 } 2492 mmdata->initialized = PETSC_TRUE; 2493 } else { 2494 /* to be safe, 
always update pointers of the mats */ 2495 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 2496 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2497 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2498 } 2499 2500 /* do cusparseSpMM, which supports transpose on B */ 2501 stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 2502 PetscCallCUSPARSE(stat); 2503 #else 2504 PetscInt k; 2505 /* cusparseXcsrmm does not support transpose on B */ 2506 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2507 cublasHandle_t cublasv2handle; 2508 cublasStatus_t cerr; 2509 2510 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2511 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2512 PetscCallCUBLAS(cerr); 2513 blda = B->cmap->n; 2514 k = B->cmap->n; 2515 } else { 2516 k = B->rmap->n; 2517 } 2518 2519 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2520 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
/* continuation of the pre-CUDA-11 cusparse_csr_spmm() call: op(B) source is mmdata->Bt when B had to be explicitly transposed */
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  /* for RARt/PtAP the sparse product landed in the intermediate mmdata->X; finish with a dense-dense multiply on the GPU */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* convert back in place if the caller originally passed CPU (MATSEQDENSE) matrices */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/* Symbolic phase for products of a SeqAIJCUSPARSE matrix A with a dense matrix B:
   sets the sizes and (CUDA) type of C from the product type, and allocates the
   MatMatCusparse product data (including the B^T work buffer needed by the
   pre-CUDA-11 csrmm path and the intermediate dense matrix X for RARt/PtAP).
   The numeric phase is MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions of C depend on the requested product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
/* Numeric phase of the sparse-sparse product C = op(A)*op(B), all operands SeqAIJCUSPARSE.
   Reuses the cuSPARSE descriptors and work buffers created in the symbolic phase (stored
   in the MatMatCusparse product data) and only recomputes the values of C on the GPU;
   it then performs a shortened MatAssemblyEnd_SeqAIJ bookkeeping at the `finalize` label.
   If mmdata->reusesym is set, the values were already computed during the symbolic phase
   (api_user case) and the GPU compute is skipped. */
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty C: nothing to compute, only the assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* if symbolic exploited symmetry, the numeric phase must use the same (remapped) product type */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the stored explicit-transpose mult structs, since spgemm has no transpose support */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: structure was fixed by cusparseSpGEMMreuse in the symbolic phase; only recompute values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: rerun compute with the saved buffer, then copy the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* CUDA < 11: legacy csrgemm interface computes values directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of the sparse-sparse product C = op(A)*op(B): determines the nonzero
   structure of C on the GPU with cuSPARSE SpGEMM, allocates C's CSR storage, and fills
   the host-side Seq AIJ bookkeeping so C behaves like a regular assembled SeqAIJCUSPARSE
   matrix. The descriptors/buffers needed by the numeric phase are kept in MatMatCusparse. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2768 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2769 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2770 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2771 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2772 2773 ptype = product->type; 2774 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2775 ptype = MATPRODUCT_AB; 2776 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2777 } 2778 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2779 ptype = MATPRODUCT_AB; 2780 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2781 } 2782 biscompressed = PETSC_FALSE; 2783 ciscompressed = PETSC_FALSE; 2784 switch (ptype) { 2785 case MATPRODUCT_AB: 2786 m = A->rmap->n; 2787 n = B->cmap->n; 2788 k = A->cmap->n; 2789 Amat = Acusp->mat; 2790 Bmat = Bcusp->mat; 2791 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2792 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2793 break; 2794 case MATPRODUCT_AtB: 2795 m = A->cmap->n; 2796 n = B->cmap->n; 2797 k = A->rmap->n; 2798 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2799 Amat = Acusp->matTranspose; 2800 Bmat = Bcusp->mat; 2801 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2802 break; 2803 case MATPRODUCT_ABt: 2804 m = A->rmap->n; 2805 n = B->rmap->n; 2806 k = A->cmap->n; 2807 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2808 Amat = Acusp->mat; 2809 Bmat = Bcusp->matTranspose; 2810 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2811 break; 2812 default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2813 } 2814 2815 /* create cusparse matrix */ 2816 PetscCall(MatSetSizes(C, m, n, m, n)); 2817 
PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2818 c = (Mat_SeqAIJ *)C->data; 2819 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2820 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2821 Ccsr = new CsrMatrix; 2822 2823 c->compressedrow.use = ciscompressed; 2824 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2825 c->compressedrow.nrows = a->compressedrow.nrows; 2826 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 2827 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2828 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2829 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2830 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2831 } else { 2832 c->compressedrow.nrows = 0; 2833 c->compressedrow.i = NULL; 2834 c->compressedrow.rindex = NULL; 2835 Ccusp->workVector = NULL; 2836 Cmat->cprowIndices = NULL; 2837 } 2838 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2839 Ccusp->mat = Cmat; 2840 Ccusp->mat->mat = Ccsr; 2841 Ccsr->num_rows = Ccusp->nrows; 2842 Ccsr->num_cols = n; 2843 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 2844 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2845 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2846 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2847 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 2848 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 2849 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2850 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2851 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2852 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2853 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2854 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2855 c->nz = 0; 2856 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2857 Ccsr->values = new THRUSTARRAY(c->nz); 2858 goto finalizesym; 2859 } 2860 2861 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2862 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2863 Acsr = (CsrMatrix *)Amat->mat; 2864 if (!biscompressed) { 2865 Bcsr = (CsrMatrix *)Bmat->mat; 2866 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2867 BmatSpDescr = Bmat->matDescr; 2868 #endif 2869 } else { /* we need to use row offsets for the full matrix */ 2870 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2871 Bcsr = new CsrMatrix; 2872 Bcsr->num_rows = B->rmap->n; 2873 Bcsr->num_cols = cBcsr->num_cols; 2874 Bcsr->num_entries = cBcsr->num_entries; 2875 Bcsr->column_indices = cBcsr->column_indices; 2876 Bcsr->values = cBcsr->values; 2877 if (!Bcusp->rowoffsets_gpu) { 2878 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2879 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 2880 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2881 } 2882 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2883 mmdata->Bcsr = Bcsr; 2884 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2885 if (Bcsr->num_rows && Bcsr->num_cols) { 2886 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2887 PetscCallCUSPARSE(stat); 2888 } 2889 BmatSpDescr = mmdata->matSpBDescr; 2890 #endif 2891 } 2892 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2893 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2894 /* precompute flops count */ 2895 if (ptype == MATPRODUCT_AB) { 2896 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2897 const PetscInt st = a->i[i]; 2898 const PetscInt en = a->i[i + 1]; 2899 for (j = st; j < en; j++) { 2900 const PetscInt brow = a->j[j]; 2901 flops += 2. * (b->i[brow + 1] - b->i[brow]); 2902 } 2903 } 2904 } else if (ptype == MATPRODUCT_AtB) { 2905 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2906 const PetscInt anzi = a->i[i + 1] - a->i[i]; 2907 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2908 flops += (2. * anzi) * bnzi; 2909 } 2910 } else { /* TODO */ 2911 flops = 0.; 2912 } 2913 2914 mmdata->flops = flops; 2915 PetscCall(PetscLogGpuTimeBegin()); 2916 2917 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2918 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2919 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2920 PetscCallCUSPARSE(stat); 2921 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2922 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2923 { 2924 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2925 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2926 */ 2927 void *dBuffer1 = NULL; 2928 void *dBuffer2 = NULL; 2929 void *dBuffer3 = NULL; 2930 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2931 size_t bufferSize1 = 0; 2932 size_t bufferSize2 = 0; 2933 size_t bufferSize3 = 0; 2934 size_t bufferSize4 = 0; 2935 size_t bufferSize5 = 0; 2936 2937 /*----------------------------------------------------------------------*/ 2938 /* ask bufferSize1 bytes for external memory */ 2939 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 2940 PetscCallCUSPARSE(stat); 2941 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 2942 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2943 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 2944 PetscCallCUSPARSE(stat); 2945 2946 /*----------------------------------------------------------------------*/ 2947 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 2948 PetscCallCUSPARSE(stat); 2949 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 2950 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 2951 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 2952 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 2953 PetscCallCUSPARSE(stat); 2954 
PetscCallCUDA(cudaFree(dBuffer1)); 2955 PetscCallCUDA(cudaFree(dBuffer2)); 2956 2957 /*----------------------------------------------------------------------*/ 2958 /* get matrix C non-zero entries C_nnz1 */ 2959 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2960 c->nz = (PetscInt)C_nnz1; 2961 /* allocate matrix C */ 2962 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2963 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2964 Ccsr->values = new THRUSTARRAY(c->nz); 2965 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2966 /* update matC with the new pointers */ 2967 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 2968 PetscCallCUSPARSE(stat); 2969 2970 /*----------------------------------------------------------------------*/ 2971 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 2972 PetscCallCUSPARSE(stat); 2973 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 2974 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 2975 PetscCallCUSPARSE(stat); 2976 PetscCallCUDA(cudaFree(dBuffer3)); 2977 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2978 PetscCallCUSPARSE(stat); 2979 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 
/ 1024)); 2980 } 2981 #else 2982 size_t bufSize2; 2983 /* ask bufferSize bytes for external memory */ 2984 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 2985 PetscCallCUSPARSE(stat); 2986 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2987 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2988 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 2989 PetscCallCUSPARSE(stat); 2990 /* ask bufferSize again bytes for external memory */ 2991 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 2992 PetscCallCUSPARSE(stat); 2993 /* The CUSPARSE documentation is not clear, nor the API 2994 We need both buffers to perform the operations properly! 2995 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2996 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2997 is stored in the descriptor! What a messy API... 
*/ 2998 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 2999 /* compute the intermediate product of A * B */ 3000 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3001 PetscCallCUSPARSE(stat); 3002 /* get matrix C non-zero entries C_nnz1 */ 3003 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3004 c->nz = (PetscInt)C_nnz1; 3005 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3006 mmdata->mmBufferSize / 1024)); 3007 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3008 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3009 Ccsr->values = new THRUSTARRAY(c->nz); 3010 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3011 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3012 PetscCallCUSPARSE(stat); 3013 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3014 PetscCallCUSPARSE(stat); 3015 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3016 #else 3017 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3018 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3019 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3020 PetscCallCUSPARSE(stat); 3021 c->nz = cnz; 3022 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3023 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3024 Ccsr->values = new THRUSTARRAY(c->nz); 3025 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3026 3027 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3028 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3029 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3030 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3031 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3032 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3033 PetscCallCUSPARSE(stat); 3034 #endif 3035 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3036 PetscCall(PetscLogGpuTimeEnd()); 3037 finalizesym: 3038 c->singlemalloc = PETSC_FALSE; 3039 c->free_a = PETSC_TRUE; 3040 c->free_ij = PETSC_TRUE; 3041 PetscCall(PetscMalloc1(m + 1, &c->i)); 3042 PetscCall(PetscMalloc1(c->nz, &c->j)); 3043 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3044 PetscInt *d_i = c->i; 3045 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3046 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3047 ii = *Ccsr->row_offsets; 3048 jj = *Ccsr->column_indices; 3049 
if (ciscompressed) d_i = c->compressedrow.i; 3050 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3051 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3052 } else { 3053 PetscInt *d_i = c->i; 3054 if (ciscompressed) d_i = c->compressedrow.i; 3055 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3056 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3057 } 3058 if (ciscompressed) { /* need to expand host row offsets */ 3059 PetscInt r = 0; 3060 c->i[0] = 0; 3061 for (k = 0; k < c->compressedrow.nrows; k++) { 3062 const PetscInt next = c->compressedrow.rindex[k]; 3063 const PetscInt old = c->compressedrow.i[k]; 3064 for (; r < next; r++) c->i[r + 1] = old; 3065 } 3066 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3067 } 3068 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3069 PetscCall(PetscMalloc1(m, &c->ilen)); 3070 PetscCall(PetscMalloc1(m, &c->imax)); 3071 c->maxnz = c->nz; 3072 c->nonzerorowcnt = 0; 3073 c->rmax = 0; 3074 for (k = 0; k < m; k++) { 3075 const PetscInt nn = c->i[k + 1] - c->i[k]; 3076 c->ilen[k] = c->imax[k] = nn; 3077 c->nonzerorowcnt += (PetscInt) !!nn; 3078 c->rmax = PetscMax(c->rmax, nn); 3079 } 3080 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3081 PetscCall(PetscMalloc1(c->nz, &c->a)); 3082 Ccsr->num_entries = c->nz; 3083 3084 C->nonzerostate++; 3085 PetscCall(PetscLayoutSetUp(C->rmap)); 3086 PetscCall(PetscLayoutSetUp(C->cmap)); 3087 Ccusp->nonzerostate = C->nonzerostate; 3088 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3089 C->preallocated = PETSC_TRUE; 3090 C->assembled = PETSC_FALSE; 3091 C->was_assembled = PETSC_FALSE; 3092 if 
(product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* Select the symbolic routine for mat = product(A,B[,C]); handles sparse or dense B.
   The GPU backend is chosen when the operands are CUSPARSE matrices not bound to the CPU;
   per-product options (-mat*_backend_cpu / -mat_product_algorithm_backend_cpu) can force the CPU path. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU backend when no operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the user entered through the old API (MatMatMult etc.)
       or the MatProduct API; both spellings are honored per product type */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default: break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx; thin wrapper over the common SpMV kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy; thin wrapper over the common SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy,
zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* yy = A^H * xx; thin wrapper over the common SpMV kernel (trans + herm) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H * xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T * xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for 0 <= i < n; one thread per entry, bounds-checked.
   Used below to scatter-add the compressed-row work vector into the full-length result.
   NOTE(review): i is int while n is PetscInt; assumes n < 2^31 — confirm for 64-bit-index builds. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Common kernel behind all the MatMult* wrappers above. yy may be NULL (no add) or equal to zz
   (in-place add); "compressed" means zero rows are dropped from the stored CSR, requiring a
   work vector and a scatter step. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just the add vector (or zero) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
    else PetscCall(VecCopy_SeqCUDA(yy, zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes the per-operation cuSpMV cache below, hence the ABI guard */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {           /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose case: cuSPARSE wrote the full-length result; only an explicit add of yy may remain */
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Runs the host assembly, then drops the cached device matrix if the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  PetscObjectState    onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}

/* Frees the GPU-side data (matrix or triangular-factor variant) and removes the
   composed methods registered by MatConvert_SeqAIJ_SeqAIJCUSPARSE()/MatBindToCPU before
   falling through to the base SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the copy in place back to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Uses a single cuBLAS axpy when the two nonzero patterns are
   identical, cusparse spgeam when X's pattern is a subset of Y's, and falls back to the
   host SeqAIJ implementation otherwise (or when one operand is not a GPU matrix). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed CPU/GPU operands: do it on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* alpha/beta (&a, &b) are host pointers here; restore device pointer mode afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays are aligned entry-for-entry, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: scale all stored nonzeros on the GPU with one cuBLAS scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero all stored values: fill the device CSR value arrays (matrix and cached transpose)
   and the host array, then set the offload mask to reflect where valid data lives */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device copy is zeroed too, so both copies are valid */
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch the operation table between the GPU (CUSPARSE) and host (SeqAIJ) implementations.
   flg = PETSC_TRUE binds to the CPU: data is copied back from the GPU and the composed
   GPU-only methods are removed; flg = PETSC_FALSE (re)installs the GPU methods. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) { /* factored matrices keep their solve path; just record the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); /* reset the array accessors to the host defaults */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocates the GPU-side context (cusparse handle,
   storage-format/algorithm defaults), installs the CUSPARSE operation table via
   MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE), and composes the type-specific methods */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support
csr */ 3735 #endif 3736 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3737 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3738 #endif 3739 B->spptr = spptr; 3740 } else { 3741 Mat_SeqAIJCUSPARSETriFactors *spptr; 3742 3743 PetscCall(PetscNew(&spptr)); 3744 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 3745 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 3746 B->spptr = spptr; 3747 } 3748 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3749 } 3750 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3751 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3752 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3753 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3754 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3755 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3756 3757 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 3758 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 3759 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3760 #if defined(PETSC_HAVE_HYPRE) 3761 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 3762 #endif 3763 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 3764 PetscFunctionReturn(0); 3765 } 3766 3767 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) { 3768 PetscFunctionBegin; 3769 PetscCall(MatCreate_SeqAIJ(B)); 3770 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 3771 PetscFunctionReturn(0); 3772 } 3773 3774 /*MC 3775 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3776 3777 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3778 CSR, ELL, or Hybrid format. 
The ELL and HYB formats require CUDA 4.2 or later. 3779 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3780 3781 Options Database Keys: 3782 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3783 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3784 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3785 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3786 3787 Level: beginner 3788 3789 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3790 M*/ 3791 3792 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *); 3793 3794 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) { 3795 PetscFunctionBegin; 3796 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band)); 3797 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 3798 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 3799 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 3800 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 3801 3802 PetscFunctionReturn(0); 3803 } 3804 3805 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) { 3806 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr; 
3807 3808 PetscFunctionBegin; 3809 if (!cusp) PetscFunctionReturn(0); 3810 delete cusp->cooPerm; 3811 delete cusp->cooPerm_a; 3812 cusp->cooPerm = NULL; 3813 cusp->cooPerm_a = NULL; 3814 if (cusp->use_extended_coo) { 3815 PetscCallCUDA(cudaFree(cusp->jmap_d)); 3816 PetscCallCUDA(cudaFree(cusp->perm_d)); 3817 } 3818 cusp->use_extended_coo = PETSC_FALSE; 3819 PetscFunctionReturn(0); 3820 } 3821 3822 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) { 3823 PetscFunctionBegin; 3824 if (*cusparsestruct) { 3825 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 3826 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 3827 delete (*cusparsestruct)->workVector; 3828 delete (*cusparsestruct)->rowoffsets_gpu; 3829 delete (*cusparsestruct)->cooPerm; 3830 delete (*cusparsestruct)->cooPerm_a; 3831 delete (*cusparsestruct)->csr2csc_i; 3832 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3833 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3834 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3835 PetscCall(PetscFree(*cusparsestruct)); 3836 } 3837 PetscFunctionReturn(0); 3838 } 3839 3840 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) { 3841 PetscFunctionBegin; 3842 if (*mat) { 3843 delete (*mat)->values; 3844 delete (*mat)->column_indices; 3845 delete (*mat)->row_offsets; 3846 delete *mat; 3847 *mat = 0; 3848 } 3849 PetscFunctionReturn(0); 3850 } 3851 3852 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) { 3853 PetscFunctionBegin; 3854 if (*trifactor) { 3855 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3856 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 3857 
PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3858 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 3859 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3860 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3861 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3862 #endif 3863 PetscCall(PetscFree(*trifactor)); 3864 } 3865 PetscFunctionReturn(0); 3866 } 3867 3868 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) { 3869 CsrMatrix *mat; 3870 3871 PetscFunctionBegin; 3872 if (*matstruct) { 3873 if ((*matstruct)->mat) { 3874 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 3875 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3876 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3877 #else 3878 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3879 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3880 #endif 3881 } else { 3882 mat = (CsrMatrix *)(*matstruct)->mat; 3883 CsrMatrix_Destroy(&mat); 3884 } 3885 } 3886 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3887 delete (*matstruct)->cprowIndices; 3888 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 3889 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 3890 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3891 3892 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3893 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3894 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3895 for (int i = 0; i < 3; i++) { 3896 if (mdata->cuSpMV[i].initialized) { 3897 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3898 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3899 
PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3900 } 3901 } 3902 #endif 3903 delete *matstruct; 3904 *matstruct = NULL; 3905 } 3906 PetscFunctionReturn(0); 3907 } 3908 3909 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) { 3910 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 3911 3912 PetscFunctionBegin; 3913 if (fs) { 3914 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 3915 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 3916 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 3917 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 3918 delete fs->rpermIndices; 3919 delete fs->cpermIndices; 3920 delete fs->workVector; 3921 fs->rpermIndices = NULL; 3922 fs->cpermIndices = NULL; 3923 fs->workVector = NULL; 3924 if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 3925 if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 3926 fs->init_dev_prop = PETSC_FALSE; 3927 #if CUSPARSE_VERSION >= 11500 3928 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 3929 PetscCallCUDA(cudaFree(fs->csrColIdx)); 3930 PetscCallCUDA(cudaFree(fs->csrVal)); 3931 PetscCallCUDA(cudaFree(fs->X)); 3932 PetscCallCUDA(cudaFree(fs->Y)); 3933 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 3934 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 3935 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 3936 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 3937 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 3938 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 3939 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 3940 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 3941 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 3942 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 3943 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 3944 
PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 3945 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 3946 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 3947 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 3948 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 3949 3950 fs->createdTransposeSpSVDescr = PETSC_FALSE; 3951 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 3952 #endif 3953 } 3954 PetscFunctionReturn(0); 3955 } 3956 3957 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) { 3958 cusparseHandle_t handle; 3959 3960 PetscFunctionBegin; 3961 if (*trifactors) { 3962 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3963 if (handle = (*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy(handle)); 3964 PetscCall(PetscFree(*trifactors)); 3965 } 3966 PetscFunctionReturn(0); 3967 } 3968 3969 struct IJCompare { 3970 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { 3971 if (t1.get<0>() < t2.get<0>()) return true; 3972 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3973 return false; 3974 } 3975 }; 3976 3977 struct IJEqual { 3978 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { 3979 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3980 return true; 3981 } 3982 }; 3983 3984 struct IJDiff { 3985 __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 
0 : 1; } 3986 }; 3987 3988 struct IJSum { 3989 __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; } 3990 }; 3991 3992 #include <thrust/iterator/discard_iterator.h> 3993 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3994 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) { 3995 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 3996 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3997 THRUSTARRAY *cooPerm_v = NULL; 3998 thrust::device_ptr<const PetscScalar> d_v; 3999 CsrMatrix *matrix; 4000 PetscInt n; 4001 4002 PetscFunctionBegin; 4003 PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct"); 4004 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix"); 4005 if (!cusp->cooPerm) { 4006 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 4007 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 4008 PetscFunctionReturn(0); 4009 } 4010 matrix = (CsrMatrix *)cusp->mat->mat; 4011 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4012 if (!v) { 4013 if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 4014 goto finalize; 4015 } 4016 n = cusp->cooPerm->size(); 4017 if (isCudaMem(v)) { 4018 d_v = thrust::device_pointer_cast(v); 4019 } else { 4020 cooPerm_v = new THRUSTARRAY(n); 4021 cooPerm_v->assign(v, v + n); 4022 d_v = cooPerm_v->data(); 4023 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 4024 } 4025 PetscCall(PetscLogGpuTimeBegin()); 4026 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4027 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4028 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 4029 auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4030 /* 
thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4031 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4032 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 4033 */ 4034 thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 4035 thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); 4036 delete cooPerm_w; 4037 } else { 4038 /* all nonzeros in d_v[] are unique entries */ 4039 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 4040 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 4041 thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 4042 } 4043 } else { 4044 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 4045 auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4046 thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 4047 } else { 4048 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 4049 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 4050 thrust::for_each(zibit, zieit, VecCUDAEquals()); 4051 } 4052 } 4053 PetscCall(PetscLogGpuTimeEnd()); 4054 finalize: 4055 delete cooPerm_v; 4056 A->offloadmask = PETSC_OFFLOAD_GPU; 4057 
PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4058 /* shorter version of MatAssemblyEnd_SeqAIJ */ 4059 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz)); 4060 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n")); 4061 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax)); 4062 a->reallocs = 0; 4063 A->info.mallocs += 0; 4064 A->info.nz_unneeded = 0; 4065 A->assembled = A->was_assembled = PETSC_TRUE; 4066 A->num_ass++; 4067 PetscFunctionReturn(0); 4068 } 4069 4070 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) { 4071 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4072 4073 PetscFunctionBegin; 4074 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4075 if (!cusp) PetscFunctionReturn(0); 4076 if (destroy) { 4077 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4078 delete cusp->csr2csc_i; 4079 cusp->csr2csc_i = NULL; 4080 } 4081 A->transupdated = PETSC_FALSE; 4082 PetscFunctionReturn(0); 4083 } 4084 4085 #include <thrust/binary_search.h> 4086 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 4087 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) { 4088 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4089 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4090 PetscInt cooPerm_n, nzr = 0; 4091 4092 PetscFunctionBegin; 4093 PetscCall(PetscLayoutSetUp(A->rmap)); 4094 PetscCall(PetscLayoutSetUp(A->cmap)); 4095 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0;
  /* a different number of COO entries invalidates any previously built permutation */
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* bring coo_i[] to the device if the caller passed host memory */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper-bound positions, becomes a->i[1..n] below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());                    /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>());         /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(),   /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,    /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the Mat_SeqAIJ with ones sized by the unique-entry counts */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    /* row lengths and max row length from the freshly built row offsets */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: picks the 'Basic' device path when indices are non-negative
   (or already on the device), otherwise falls back to the CPU extended-COO path and mirrors
   its jmap/perm arrays to the device */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* negative indices (which mean "ignore this entry") can only be detected on the host */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
PetscFunctionReturn(0); 4257 } 4258 4259 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) { 4260 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4261 const PetscCount grid_size = gridDim.x * blockDim.x; 4262 for (; i < nnz; i += grid_size) { 4263 PetscScalar sum = 0.0; 4264 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4265 a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4266 } 4267 } 4268 4269 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) { 4270 Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4271 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4272 PetscCount Annz = seq->nz; 4273 PetscMemType memtype; 4274 const PetscScalar *v1 = v; 4275 PetscScalar *Aa; 4276 4277 PetscFunctionBegin; 4278 if (dev->use_extended_coo) { 4279 PetscCall(PetscGetMemType(v, &memtype)); 4280 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4281 PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar))); 4282 PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4283 } 4284 4285 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 4286 else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4287 4288 if (Annz) { 4289 MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa); 4290 PetscCallCUDA(cudaPeekAtLastError()); 4291 } 4292 4293 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 4294 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4295 4296 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4297 } else { 4298 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode)); 4299 } 4300 PetscFunctionReturn(0); 4301 } 4302 4303 /*@C 4304 MatSeqAIJCUSPARSEGetIJ - 
returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 4305 4306 Not collective 4307 4308 Input Parameters: 4309 + A - the matrix 4310 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4311 4312 Output Parameters: 4313 + ia - the CSR row pointers 4314 - ja - the CSR column indices 4315 4316 Level: developer 4317 4318 Notes: 4319 When compressed is true, the CSR structure does not contain empty rows 4320 4321 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4322 @*/ 4323 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) { 4324 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4325 CsrMatrix *csr; 4326 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4327 4328 PetscFunctionBegin; 4329 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4330 if (!i || !j) PetscFunctionReturn(0); 4331 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4332 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4333 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4334 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4335 csr = (CsrMatrix *)cusp->mat->mat; 4336 if (i) { 4337 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4338 if (!cusp->rowoffsets_gpu) { 4339 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4340 cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4341 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4342 } 4343 *i = cusp->rowoffsets_gpu->data().get(); 4344 } else *i = csr->row_offsets->data().get(); 4345 } 4346 if (j) *j = csr->column_indices->data().get(); 4347 PetscFunctionReturn(0); 4348 } 4349 4350 /*@C 4351 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4352 4353 Not 
collective 4354 4355 Input Parameters: 4356 + A - the matrix 4357 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4358 4359 Output Parameters: 4360 + ia - the CSR row pointers 4361 - ja - the CSR column indices 4362 4363 Level: developer 4364 4365 .seealso: `MatSeqAIJCUSPARSEGetIJ()` 4366 @*/ 4367 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) { 4368 PetscFunctionBegin; 4369 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4370 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4371 if (i) *i = NULL; 4372 if (j) *j = NULL; 4373 PetscFunctionReturn(0); 4374 } 4375 4376 /*@C 4377 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4378 4379 Not Collective 4380 4381 Input Parameter: 4382 . A - a MATSEQAIJCUSPARSE matrix 4383 4384 Output Parameter: 4385 . a - pointer to the device data 4386 4387 Level: developer 4388 4389 Notes: may trigger host-device copies if up-to-date matrix data is on host 4390 4391 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 4392 @*/ 4393 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) { 4394 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4395 CsrMatrix *csr; 4396 4397 PetscFunctionBegin; 4398 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4399 PetscValidPointer(a, 2); 4400 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4401 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4402 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4403 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4404 csr = (CsrMatrix *)cusp->mat->mat; 4405 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4406 *a = 
csr->values->data().get(); 4407 PetscFunctionReturn(0); 4408 } 4409 4410 /*@C 4411 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4412 4413 Not Collective 4414 4415 Input Parameter: 4416 . A - a MATSEQAIJCUSPARSE matrix 4417 4418 Output Parameter: 4419 . a - pointer to the device data 4420 4421 Level: developer 4422 4423 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 4424 @*/ 4425 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) { 4426 PetscFunctionBegin; 4427 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4428 PetscValidPointer(a, 2); 4429 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4430 *a = NULL; 4431 PetscFunctionReturn(0); 4432 } 4433 4434 /*@C 4435 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4436 4437 Not Collective 4438 4439 Input Parameter: 4440 . A - a MATSEQAIJCUSPARSE matrix 4441 4442 Output Parameter: 4443 . 
a - pointer to the device data 4444 4445 Level: developer 4446 4447 Notes: may trigger host-device copies if up-to-date matrix data is on host 4448 4449 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 4450 @*/ 4451 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) { 4452 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4453 CsrMatrix *csr; 4454 4455 PetscFunctionBegin; 4456 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4457 PetscValidPointer(a, 2); 4458 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4459 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4460 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4461 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4462 csr = (CsrMatrix *)cusp->mat->mat; 4463 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4464 *a = csr->values->data().get(); 4465 A->offloadmask = PETSC_OFFLOAD_GPU; 4466 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4467 PetscFunctionReturn(0); 4468 } 4469 /*@C 4470 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4471 4472 Not Collective 4473 4474 Input Parameter: 4475 . A - a MATSEQAIJCUSPARSE matrix 4476 4477 Output Parameter: 4478 . 
a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified through the pointer: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only: unlike MatSeqAIJCUSPARSEGetArray() no host-to-device copy is performed first */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* caller will overwrite the device values: mark GPU data as authoritative and drop any cached transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the
write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* device values were overwritten: invalidate the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row, column) ordering of (row, col, value, perm-flag) tuples; value and flag are
   carried along but never compared, so equal (row, col) entries keep their relative order meaningful
   only through the merge's stability */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed offset to an int; used below to shift column indices and row offsets */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: structure (indices) and values are both assembled on the GPU */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records, for each entry of C, where it came from; reused in the MAT_REUSE_MATRIX path
       to scatter fresh values of A and B into C without redoing the merge */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand both CSR structures to COO so the entries can be merged as (row, col) pairs */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* shift B's columns by A->cmap->n on the fly so they land in the right half of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      /* merge the two sorted COO listings; wPerm receives 1 for entries of A, 0 for entries of B */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place column shift applied to B above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged positions back into "came from A" (first Annz slots of cooPerm) and
         "came from B" (remaining Bnnz slots) */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged row indices back to CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* C' = [A' ; B'] stacked by rows: concatenate row offsets (B's shifted by a->nz and
           overlapping A's last offset), column indices and values of the two transposes */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the structure on the host so that C is a fully valid SeqAIJ matrix too */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is unchanged, only scatter the new values of A and B
       into C using the permutation recorded in cooPerm by the initial merge */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* first Annz slots of cooPerm address A's entries in C, the rest address B's entries */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* the transpose values are simply [A' values ; B' values] in order, no permutation needed */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gather n values of A into v[]: with idx, v[k] = aij_values[idx[k]]; without idx, the first n values
   are copied. v may point to either host or device memory (detected via isCudaMem); the device case
   gathers in place without a host roundtrip. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch buffer, then copy down */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved device-to-host, so log in that direction
     (the original code logged CpuToGpu here, which contradicts the cudaMemcpyDeviceToHost above) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}