/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
/* VecType is redefined by the CUSPARSE implementation header included next */
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
#define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* String table for MatCUSPARSEStorageFormat in the layout PetscOptionsEnum() expects:
   the enum value names (in 0-based value order), then the enum type name, the option prefix, and a NULL terminator */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* Note the SpMM table lists "COO_ALG4" AFTER "CSR_ALG1" so that each name's index matches its cuSPARSE
   integer value (CSR_ALG1 == 4, COO_ALG4 == 5); the PetscCheck()s in MatSetFromOptions verify this mapping */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* Forward declarations of the MatOps implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
/* C++ overload set: the next two declarations share a name but take different struct types */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Method implementation behind MatCUSPARSESetFormat(): records the requested GPU storage format
   in the Mat_SeqAIJCUSPARSE struct; only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are accepted here
   (both simply set the single format field of the sequential matrix) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: cusparsestruct->format = format; break;
  case MAT_CUSPARSE_ALL: cusparsestruct->format = format; break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
. op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
  `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* PetscTryMethod: silently a no-op on matrix types that do not provide MatCUSPARSESetFormat_C */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}

/* Method implementation behind MatCUSPARSESetUseCPUSolve(): stores the flag that later steers
   MatLUFactorNumeric_SeqAIJCUSPARSE() away from installing the GPU MatSolve variants */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).

  Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}

/* MatSetOption override: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the cached GPU transpose can be
   invalidated when the option is turned off; everything else is forwarded to the SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: pulls A back from the GPU if needed, runs the built-in CPU
   factorization, then selects the (GPU or CPU) MatSolve variants and, unless use_cpu_solve
   is set, uploads the triangular factors to the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); /* factorization itself is done on the CPU */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    /* natural ordering (identity row and column permutations) allows the cheaper solve kernels */
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Options processing for MATSEQAIJCUSPARSE: storage format, CPU-solve flag, and (CUDA >= 11)
   the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. The PetscCheck()s guard against cuSPARSE
   renumbering its enums, since PetscOptionsEnum() maps names positionally */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Build (or refresh) the unit-lower-triangular ILU factor on the GPU from the host factor in A */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): assumes the factored SeqAIJ layout where row 0 of L holds only the unit diagonal — confirm against MatLUFactorNumeric_SeqAIJ */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first call: build the full CSR structure (row offsets, column indices, values) in pinned host memory */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: strictly-lower entries copied from A, explicit 1.0 appended on each diagonal */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for later value-only updates; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh) the upper-triangular ILU factor U on the GPU. Rows are assembled backwards
   (i = n-1 .. 0) because the SeqAIJ factored format stores U rows via a->diag offsets.
   NOTE(review): AAUp[diag] = 1./v[nz] with CUSPARSE_DIAG_TYPE_NON_UNIT — presumably the CPU factor
   stores the inverted diagonal, so the reciprocal recovers the true diagonal entry; confirm */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned values buffer for later value-only updates; index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Upload both ILU triangular factors and, when the row/column orderings are not the identity,
   the corresponding permutation index arrays, to the GPU; also allocates the work vector used by the solves */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh) both ICC (Cholesky) triangular factors on the GPU from the host factor.
   Only U is stored explicitly (upper CSR, unit-diagonal descriptor); the "lower" factor reuses
   the same sparsity pattern and is solved as U^T via CUSPARSE_OPERATION_TRANSPOSE.
   NOTE(review): the matrix data is read through a Mat_SeqSBAIJ cast of A->data (factored ICC
   stores an sbaij-layout factor) — confirm against the Cholesky factorization routines */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonals are negated for U; the L values are additionally scaled by the diagonal */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* UPPER fill mode is intentional: L is represented as U^T (see solveOp below) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else { /* update values only, sparsity pattern already on the GPU */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Upload the ICC triangular factors (and row permutation, if any) to the GPU */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 769 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 770 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 771 772 A->offloadmask = PETSC_OFFLOAD_BOTH; 773 774 /* lower triangular indices */ 775 PetscCall(ISIdentity(ip, &perm_identity)); 776 if (!perm_identity) { 777 IS iip; 778 const PetscInt *irip, *rip; 779 780 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 781 PetscCall(ISGetIndices(iip, &irip)); 782 PetscCall(ISGetIndices(ip, &rip)); 783 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 784 cusparseTriFactors->rpermIndices->assign(rip, rip + n); 785 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 786 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 787 PetscCall(ISRestoreIndices(iip, &irip)); 788 PetscCall(ISDestroy(&iip)); 789 PetscCall(ISRestoreIndices(ip, &rip)); 790 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 791 } 792 PetscFunctionReturn(0); 793 } 794 795 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) { 796 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 797 IS ip = b->row; 798 PetscBool perm_identity; 799 800 PetscFunctionBegin; 801 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 802 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 803 B->offloadmask = PETSC_OFFLOAD_CPU; 804 /* determine which version of MatSolve needs to be used. 
*/ 805 PetscCall(ISIdentity(ip, &perm_identity)); 806 if (perm_identity) { 807 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 808 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 809 B->ops->matsolve = NULL; 810 B->ops->matsolvetranspose = NULL; 811 } else { 812 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 813 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 814 B->ops->matsolve = NULL; 815 B->ops->matsolvetranspose = NULL; 816 } 817 818 /* get the triangular factors */ 819 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 820 PetscFunctionReturn(0); 821 } 822 823 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) { 824 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 825 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 826 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 827 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 828 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 829 cusparseIndexBase_t indexBase; 830 cusparseMatrixType_t matrixType; 831 cusparseFillMode_t fillMode; 832 cusparseDiagType_t diagType; 833 834 PetscFunctionBegin; 835 /* allocate space for the transpose of the lower triangular factor */ 836 PetscCall(PetscNew(&loTriFactorT)); 837 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 838 839 /* set the matrix descriptors of the lower triangular factor */ 840 matrixType = cusparseGetMatType(loTriFactor->descr); 841 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 842 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
/* Builds explicitly transposed (CSC) copies of both triangular factors and runs the
   cusparse solve analysis on them, so that MatSolveTranspose can use two
   NON_TRANSPOSE csrsv solves. Results are stored in loTriFactorPtrTranspose /
   upTriFactorPtrTranspose of the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

   Bug fix: the two calls after WaitForCUDA() that followed each cusparse_csr2csc
   used PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,...) a second time instead
   of PetscLogEventEnd, leaving the event permanently open and corrupting -log_view
   timings. They are now PetscLogEventEnd. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* TODO: the lower and upper passes below are identical except for the factor being
     transposed; they should be hoisted into a shared helper */

  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; transposing flips the stored triangle */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; transposing flips the stored triangle */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Functor: truncate the real part of a PetscScalar to a PetscInt.
   Used below to recover the csr2csc permutation from a sequence stored as scalars. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};
PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1001 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1002 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1003 if (A->transupdated) PetscFunctionReturn(0); 1004 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1005 PetscCall(PetscLogGpuTimeBegin()); 1006 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1007 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1008 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1009 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1010 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1011 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1012 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1013 1014 /* set alpha and beta */ 1015 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 1016 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 1017 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1018 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1019 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1020 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1021 1022 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1023 CsrMatrix *matrixT = new CsrMatrix; 1024 matstructT->mat = matrixT; 1025 matrixT->num_rows = A->cmap->n; 1026 matrixT->num_cols = A->rmap->n; 1027 matrixT->num_entries = a->nz; 1028 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1029 matrixT->column_indices = new 
THRUSTINTARRAY32(a->nz); 1030 matrixT->values = new THRUSTARRAY(a->nz); 1031 1032 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1033 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1034 1035 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1036 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1037 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1038 indexBase, cusparse_scalartype); 1039 PetscCallCUSPARSE(stat); 1040 #else 1041 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1042 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1043 1044 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1045 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1046 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1047 */ 1048 if (matrixT->num_entries) { 1049 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1050 PetscCallCUSPARSE(stat); 1051 1052 } else { 1053 matstructT->matDescr = NULL; 1054 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1055 } 1056 #endif 1057 #endif 1058 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1059 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1060 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1061 #else 1062 CsrMatrix *temp = new CsrMatrix; 1063 CsrMatrix *tempT = new CsrMatrix; 1064 /* First convert HYB to CSR */ 1065 temp->num_rows = A->rmap->n; 1066 temp->num_cols = A->cmap->n; 1067 temp->num_entries = a->nz; 1068 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1069 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1070 temp->values = new THRUSTARRAY(a->nz); 1071 1072 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1073 PetscCallCUSPARSE(stat); 1074 1075 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1076 tempT->num_rows = A->rmap->n; 1077 tempT->num_cols = A->cmap->n; 1078 tempT->num_entries = a->nz; 1079 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1080 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1081 tempT->values = new THRUSTARRAY(a->nz); 1082 1083 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1084 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1085 PetscCallCUSPARSE(stat); 1086 1087 /* Last, convert CSC to HYB */ 1088 cusparseHybMat_t hybMat; 1089 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1090 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1091 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1092 PetscCallCUSPARSE(stat); 1093 1094 /* assign the pointer */ 1095 matstructT->mat = hybMat; 1096 A->transupdated = PETSC_TRUE; 1097 /* delete temporaries */ 1098 if (tempT) { 1099 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1100 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1101 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1102 delete (CsrMatrix *)tempT; 1103 } 1104 if (temp) { 1105 if (temp->values) delete (THRUSTARRAY *)temp->values; 1106 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1107 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1108 delete (CsrMatrix *)temp; 1109 } 1110 #endif 1111 } 1112 } 1113 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, 
update data */ 1114 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1115 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1116 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1117 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1118 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1119 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1120 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1121 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1122 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1123 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1124 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1125 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1126 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1127 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1128 } 1129 if (!cusparsestruct->csr2csc_i) { 1130 THRUSTARRAY csr2csc_a(matrix->num_entries); 1131 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1132 1133 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1134 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1135 void *csr2cscBuffer; 1136 size_t csr2cscBufferSize; 1137 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1138 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, 
&csr2cscBufferSize); 1139 PetscCallCUSPARSE(stat); 1140 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1141 #endif 1142 1143 if (matrix->num_entries) { 1144 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1145 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1146 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1147 1148 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1149 should be filled with indexBase. So I just take a shortcut here. 1150 */ 1151 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1153 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1154 PetscCallCUSPARSE(stat); 1155 #else 1156 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1157 PetscCallCUSPARSE(stat); 1158 #endif 1159 } else { 1160 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1161 } 1162 1163 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1164 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1165 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1166 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1167 #endif 1168 } 1169 PetscCallThrust( 1170 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1171 } 1172 PetscCall(PetscLogGpuTimeEnd()); 1173 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1174 /* the compressed row indices is not used for matTranspose */ 1175 matstructT->cprowIndices = NULL; 1176 /* assign the pointer */ 1177 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1178 A->transupdated = PETSC_TRUE; 1179 PetscFunctionReturn(0); 1180 } 1181 1182 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1183 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) { 1184 PetscInt n = xx->map->n; 1185 const PetscScalar *barray; 1186 PetscScalar *xarray; 1187 thrust::device_ptr<const PetscScalar> bGPU; 1188 thrust::device_ptr<PetscScalar> xGPU; 1189 cusparseStatus_t stat; 1190 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1191 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1192 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1193 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1194 1195 PetscFunctionBegin; 1196 /* Analyze the matrix and create the transpose ... 
/* Solves A^T x = b using the explicitly transposed triangular factors (built lazily
   by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve), applying the stored row permutation
   before and the column permutation after the two triangular solves. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt                               n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  cusparseStatus_t                       stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
#else
                              tempGPU->data().get());
#endif
  PetscCallCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
#else
                              xarray);
#endif
  PetscCallCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
on the fly */ 1267 if (!loTriFactorT && !upTriFactorT) { 1268 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1269 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1270 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1271 } 1272 1273 /* Get the GPU pointers */ 1274 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1275 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1276 1277 PetscCall(PetscLogGpuTimeBegin()); 1278 /* First, solve U */ 1279 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, 1280 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1281 upTriFactorT->csrMat->num_entries, 1282 #endif 1283 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, 1284 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1285 tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer); 1286 PetscCallCUSPARSE(stat); 1287 #else 1288 tempGPU->data().get()); 1289 PetscCallCUSPARSE(stat); 1290 #endif 1291 1292 /* Then, solve L */ 1293 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, 1294 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1295 loTriFactorT->csrMat->num_entries, 1296 #endif 1297 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1298 tempGPU->data().get(), 1299 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1300 xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer); 1301 PetscCallCUSPARSE(stat); 1302 #else 1303 xarray); 1304 PetscCallCUSPARSE(stat); 1305 #endif 1306 1307 /* restore */ 1308 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1309 
PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1310 PetscCall(PetscLogGpuTimeEnd()); 1311 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1312 PetscFunctionReturn(0); 1313 } 1314 1315 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) { 1316 const PetscScalar *barray; 1317 PetscScalar *xarray; 1318 thrust::device_ptr<const PetscScalar> bGPU; 1319 thrust::device_ptr<PetscScalar> xGPU; 1320 cusparseStatus_t stat; 1321 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1322 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1323 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1324 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1325 1326 PetscFunctionBegin; 1327 1328 /* Get the GPU pointers */ 1329 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1330 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1331 xGPU = thrust::device_pointer_cast(xarray); 1332 bGPU = thrust::device_pointer_cast(barray); 1333 1334 PetscCall(PetscLogGpuTimeBegin()); 1335 /* First, reorder with the row permutation */ 1336 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1337 1338 /* Next, solve L */ 1339 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 1340 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1341 loTriFactor->csrMat->num_entries, 1342 #endif 1343 &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 1344 tempGPU->data().get(), 1345 #if 
PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1346 xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer); 1347 PetscCallCUSPARSE(stat); 1348 #else 1349 xarray); 1350 PetscCallCUSPARSE(stat); 1351 #endif 1352 1353 /* Then, solve U */ 1354 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 1355 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1356 upTriFactor->csrMat->num_entries, 1357 #endif 1358 &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, 1359 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1360 tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer); 1361 PetscCallCUSPARSE(stat); 1362 #else 1363 tempGPU->data().get()); 1364 PetscCallCUSPARSE(stat); 1365 #endif 1366 1367 /* Last, reorder with the column permutation */ 1368 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1369 1370 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1371 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1372 PetscCall(PetscLogGpuTimeEnd()); 1373 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1374 PetscFunctionReturn(0); 1375 } 1376 1377 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) { 1378 const PetscScalar *barray; 1379 PetscScalar *xarray; 1380 cusparseStatus_t stat; 1381 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1382 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1383 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct 
*)cusparseTriFactors->upTriFactorPtr; 1384 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1385 1386 PetscFunctionBegin; 1387 /* Get the GPU pointers */ 1388 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1389 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1390 1391 PetscCall(PetscLogGpuTimeBegin()); 1392 /* First, solve L */ 1393 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 1394 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1395 loTriFactor->csrMat->num_entries, 1396 #endif 1397 &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, 1398 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1399 tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer); 1400 PetscCallCUSPARSE(stat); 1401 #else 1402 tempGPU->data().get()); 1403 PetscCallCUSPARSE(stat); 1404 #endif 1405 1406 /* Next, solve U */ 1407 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 1408 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1409 upTriFactor->csrMat->num_entries, 1410 #endif 1411 &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 1412 tempGPU->data().get(), 1413 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1414 xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer); 1415 PetscCallCUSPARSE(stat); 1416 #else 1417 xarray); 1418 PetscCallCUSPARSE(stat); 1419 #endif 1420 1421 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1422 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1423 PetscCall(PetscLogGpuTimeEnd()); 1424 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1425 PetscFunctionReturn(0); 1426 } 1427 1428 #if CUSPARSE_VERSION >= 
11500 1429 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1430 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) { 1431 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1432 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1433 const PetscScalar *barray; 1434 PetscScalar *xarray; 1435 1436 PetscFunctionBegin; 1437 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1438 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1439 PetscCall(PetscLogGpuTimeBegin()); 1440 1441 /* Solve L*y = b */ 1442 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1443 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1444 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1445 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, 1446 fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()! 
1447 1448 /* Solve U*x = y */ 1449 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1450 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */ 1451 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U)); 1452 1453 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1454 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1455 1456 PetscCall(PetscLogGpuTimeEnd()); 1457 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1458 PetscFunctionReturn(0); 1459 } 1460 1461 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) { 1462 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1463 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1464 const PetscScalar *barray; 1465 PetscScalar *xarray; 1466 1467 PetscFunctionBegin; 1468 if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */ 1469 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1470 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do tranpose solve with it */ 1471 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1472 1473 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1474 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1475 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1476 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1477 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1478 } 1479 1480 if (!fs->updatedTransposeSpSVAnalysis) { 1481 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1482 1483 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1484 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1485 } 1486 1487 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1488 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1489 PetscCall(PetscLogGpuTimeBegin()); 1490 1491 /* Solve Ut*y = b */ 1492 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1493 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1494 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */ 1495 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut)); 1496 1497 /* Solve Lt*x = y */ 1498 
PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1499 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1500 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1501 1502 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1503 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1504 PetscCall(PetscLogGpuTimeEnd()); 1505 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1506 PetscFunctionReturn(0); 1507 } 1508 1509 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) { 1510 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1511 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1512 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1513 CsrMatrix *Acsr; 1514 PetscInt m, nz; 1515 PetscBool flg; 1516 1517 PetscFunctionBegin; 1518 if (PetscDefined(USE_DEBUG)) { 1519 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1520 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1521 } 1522 1523 /* Copy A's value to fact */ 1524 m = fact->rmap->n; 1525 nz = aij->nz; 1526 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1527 Acsr = (CsrMatrix *)Acusp->mat->mat; 1528 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1529 1530 /* Factorize fact inplace */ 1531 if (m) 1532 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1533 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1534 if (PetscDefined(USE_DEBUG)) { 1535 int numerical_zero; 1536 cusparseStatus_t status; 1537 status = 
cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - symbolic phase of the in-place ILU(0):
   copies A's sparsity pattern to the factor, creates the cusparse matrix/vector/SpSV
   descriptors for M (the combined factor), L (unit-diagonal lower) and U (upper),
   allocates and shares the work buffers, runs the csrilu02 structural analysis, and
   installs the numeric-factorization callback.

   isrow/iscol are accepted for interface compatibility; ILU(0) here factors in the
   natural ordering (aij->row/col are cleared below). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask = PETSC_OFFLOAD_BOTH;
  fact->factortype = MAT_FACTOR_ILU;
  fact->info.factor_mallocs = 0;
  fact->info.fill_ratio_given = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) allows no fill-in */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L of an LU factorization has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow = Ai[i + 1] - Ai[i];
        /* Exact count of nonzeros strictly left of the diagonal in row i.
           Bug fix: this value was previously clobbered by the ICC-style guess
           nzLeft = (nzRow - 1) / 2, which ignored the diagonal position that the
           guard above just computed (and made the first assignment a dead store). */
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* MatSolve_SeqAIJCUSPARSE_ICC0 - solve A x = b with the IC(0) factor L (A = L L^t):
   L y = b, then L^t x = y. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar *barray;
  PetscScalar *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE,
fs->spMatDescr_L, /* L Y = X */ 1731 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1732 1733 /* Solve Lt*x = y */ 1734 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1735 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1736 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1737 1738 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1739 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1740 1741 PetscCall(PetscLogGpuTimeEnd()); 1742 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) { 1747 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1748 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1749 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1750 CsrMatrix *Acsr; 1751 PetscInt m, nz; 1752 PetscBool flg; 1753 1754 PetscFunctionBegin; 1755 if (PetscDefined(USE_DEBUG)) { 1756 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1757 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1758 } 1759 1760 /* Copy A's value to fact */ 1761 m = fact->rmap->n; 1762 nz = aij->nz; 1763 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1764 Acsr = (CsrMatrix *)Acusp->mat->mat; 1765 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1766 1767 /* Factorize fact inplace */ 1768 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1769 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1770 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1771 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1772 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1773 */ 1774 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1775 if (PetscDefined(USE_DEBUG)) { 1776 int numerical_zero; 1777 cusparseStatus_t status; 1778 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1779 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1780 } 1781 1782 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1783 1784 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1785 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1786 */ 1787 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1788 1789 fact->offloadmask = PETSC_OFFLOAD_GPU; 1790 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1791 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1792 fact->ops->matsolve = NULL; 1793 fact->ops->matsolvetranspose = NULL; 1794 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1795 PetscFunctionReturn(0); 1796 } 1797 1798 
/* Symbolic phase of the device IC(0) factorization.

   With zero fill the factor has exactly A's (lower triangular) sparsity pattern, so this routine
   only mirrors A's CSR structure onto fact, creates the cuSPARSE matrix/vector/solve descriptors,
   sizes and allocates the csric02 and SpSV work buffers (sharing the larger SpSV buffer with the
   factorization buffer to save device memory), runs the csric02 analysis, and precomputes a flop
   estimate for the numeric phase.

   Input Parameters:
+  fact - the factor matrix to set up
.  A    - the MATSEQAIJCUSPARSE matrix to be factored (must be square with a full diagonal; checked in debug builds)
.  perm - row/column permutation (callers only reach here with an identity permutation)
-  info - factorization options; info->fill is recorded, the effective fill ratio is 1.0 by construction */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) introduces no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares the full CSR arrays of M; the FILL_MODE attribute tells SpSV to use only the lower triangle */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M; /* the larger solve buffer aliases the factorization buffer */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left of the diagonal one by one. Assume each elimination updates the
           nonzeros to the right of (and including) the eliminated one, incurring a multiplication and an
           addition per updated entry.
        */
        nzLeft = (nzRow - 1) / 2; /* rows are symmetric-pattern: roughly half the off-diagonal entries lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
#endif

/* ILU symbolic dispatcher: uses the all-device ILU(0) path when available (cusparse >= 11.5),
   the factorization is requested on device, there is no fill (levels == 0), and both row/column
   permutations are identities; otherwise falls back to the host symbolic factorization with a
   device numeric phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }

  PetscFunctionReturn(0);
}

/* LU symbolic: always uses the host symbolic phase; only the numeric phase runs the CUSPARSE path */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* ICC symbolic dispatcher: analogous to the ILU dispatcher above — all-device IC(0) path when
   levels == 0 and perm is the identity, otherwise host symbolic + device numeric */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* Cholesky symbolic: host symbolic phase, device numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Reports the MatSolverType implemented by this file ("cusparse"); composed on factor matrices */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type =
MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factory routine registered for MATSEQAIJCUSPARSE + MATSOLVERCUSPARSE.

   Creates the (square) factor matrix B, reads -mat_factor_bind_factorization to decide whether the
   factorization runs on "host" or "device" (default), and installs the symbolic factorization
   function pointers appropriate for the requested factor type. If A is bound to the CPU the plain
   SeqAIJ symbolic routines are installed instead of the CUSPARSE ones. Also records the preferred
   matrix orderings per factor type (ND for full LU/Cholesky, natural for ILU/ICC/ILUDT). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Downloads the matrix values (pattern is assumed unchanged) from device to host when the device
   copy is ahead (PETSC_OFFLOAD_GPU). Handles both unfactored matrices (values live in the CsrMatrix)
   and device-factorized matrices (values live in fs->csrVal, when that path is compiled in).
   Afterwards both copies are in sync (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

static
/* MatSeqAIJGetArray: read/write host access — first sync host values from the device */
PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

/* Restore after read/write access: host copy is now authoritative */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Read-only host access — sync from device but leave the offload mask untouched */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only host access — no download needed since the old values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Returns device pointers to the CSR arrays (and the memory type) of an unfactored matrix.
   Only available with 32-bit PetscInt since the device arrays are 32-bit (THRUSTINTARRAY32). */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Uploads the host SeqAIJ matrix to the device.

   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the format is CSR, only
   the values are re-assigned into the existing device arrays and the cached transpose is
   invalidated (values only). Otherwise the whole device structure is rebuilt: the old multiply
   struct is destroyed, CSR (or, pre CUDA-11, ELL/HYB) arrays are created and filled, the scalar
   constants used by SpMV are pushed to the device, and compressed-row indices are uploaded when in
   use. If the host has no value array yet (!a->a), only the structure is uploaded and the offload
   mask is left unchanged (both = PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE; /* structure-only upload; host/device values are not in sync */
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident 1 and 0 constants, required since the handle uses CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR first, convert it to HYB, then discard the CSR copy */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: y += x on zipped (x, y) tuples */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: y = x on zipped (x, y) tuples */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: x = y on zipped (x, y) tuples (copy in the reverse direction) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product workspace attached to C->product->data for sparse(AIJCUSPARSE) x dense(DENSECUDA)
   and sparse x sparse products; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense; /* was C originally MATSEQDENSE (so it must be converted back after the product)? */
  PetscScalar   *Bt;       /* explicit transpose of B (pre CUDA-11 path, where csrmm cannot transpose B) */
  Mat            X;        /* intermediate dense product for PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse: releases all device buffers, cuSPARSE descriptors, and the
   intermediate matrix X */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data) {
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);

/* Numeric phase of C = op(A) op(B) where A is MATSEQAIJCUSPARSE and B is dense.

   Supports MATPRODUCT_AB, AtB, ABt, PtAP and RARt. For AtB either the explicit transpose of A is
   used (when A->form_explicit_transpose) or cusparse is asked to transpose. On CUDA >= 11 the
   cusparseSpMM generic API is used with cached dense-matrix descriptors and a work buffer that is
   (re)built whenever the leading dimensions of B or C change; on older CUDA, csrmm is used, with B
   transposed explicitly via cublasXgeam when needed. For PtAP/RARt the sparse product lands in the
   intermediate mmdata->X and a dense-dense product finishes the job. A non-CUDA dense B is
   converted in place for the duration of the call. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray)); /* sparse result lands in the intermediate X */
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow the work buffer only when the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* convert C and B back to their original (host) types if we converted them above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
2569 n = B->cmap->n; 2570 break; 2571 case MATPRODUCT_ABt: 2572 m = A->rmap->n; 2573 n = B->rmap->n; 2574 break; 2575 case MATPRODUCT_PtAP: 2576 m = B->cmap->n; 2577 n = B->cmap->n; 2578 break; 2579 case MATPRODUCT_RARt: 2580 m = B->rmap->n; 2581 n = B->rmap->n; 2582 break; 2583 default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2584 } 2585 PetscCall(MatSetSizes(C, m, n, m, n)); 2586 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2587 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2588 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2589 2590 /* product data */ 2591 PetscCall(PetscNew(&mmdata)); 2592 mmdata->cisdense = cisdense; 2593 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2594 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2595 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2596 #endif 2597 /* for these products we need intermediate storage */ 2598 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2599 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2600 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2601 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2602 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2603 } else { 2604 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2605 } 2606 } 2607 C->product->data = mmdata; 2608 C->product->destroy = MatDestroy_MatMatCusparse; 2609 2610 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2611 PetscFunctionReturn(0); 2612 } 2613 2614 static PetscErrorCode 
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  /* Numeric phase of sparse-sparse products (AB, AtB, ABt) of SeqAIJCUSPARSE matrices:
     recomputes the values of C on the GPU via cuSPARSE SpGEMM (reusing the structures
     and buffers created in the symbolic phase), then runs a trimmed MatAssemblyEnd. */
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* skip the SpGEMM, only finish the assembly bookkeeping */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetry lets us replace a transposed operand with the plain one; the symbolic
     phase must have made the same substitution (checked below) */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized with explicitly stored transpose structs, since opA/opB stay NON_TRANSPOSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure was fixed in symbolic via cusparseSpGEMMreuse_*; only values are recomputed here */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* pre-CUDA 11: legacy csrgemm interface recomputes values in place */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of sparse-sparse products (AB, AtB, ABt) of SeqAIJCUSPARSE matrices:
   determines the nonzero structure of C on the GPU with cuSPARSE SpGEMM, allocates the
   CSR storage of C (device and host mirrors), and stashes everything needed by the
   numeric phase in the MatMatCusparse product data. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2773 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2774 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2775 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2776 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2777 2778 ptype = product->type; 2779 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2780 ptype = MATPRODUCT_AB; 2781 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2782 } 2783 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2784 ptype = MATPRODUCT_AB; 2785 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2786 } 2787 biscompressed = PETSC_FALSE; 2788 ciscompressed = PETSC_FALSE; 2789 switch (ptype) { 2790 case MATPRODUCT_AB: 2791 m = A->rmap->n; 2792 n = B->cmap->n; 2793 k = A->cmap->n; 2794 Amat = Acusp->mat; 2795 Bmat = Bcusp->mat; 2796 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2797 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2798 break; 2799 case MATPRODUCT_AtB: 2800 m = A->cmap->n; 2801 n = B->cmap->n; 2802 k = A->rmap->n; 2803 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2804 Amat = Acusp->matTranspose; 2805 Bmat = Bcusp->mat; 2806 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2807 break; 2808 case MATPRODUCT_ABt: 2809 m = A->rmap->n; 2810 n = B->rmap->n; 2811 k = A->cmap->n; 2812 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2813 Amat = Acusp->mat; 2814 Bmat = Bcusp->matTranspose; 2815 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2816 break; 2817 default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2818 } 2819 2820 /* create cusparse matrix */ 2821 PetscCall(MatSetSizes(C, m, n, m, n)); 2822 
PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2823 c = (Mat_SeqAIJ *)C->data; 2824 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2825 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2826 Ccsr = new CsrMatrix; 2827 2828 c->compressedrow.use = ciscompressed; 2829 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2830 c->compressedrow.nrows = a->compressedrow.nrows; 2831 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 2832 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2833 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2834 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2835 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2836 } else { 2837 c->compressedrow.nrows = 0; 2838 c->compressedrow.i = NULL; 2839 c->compressedrow.rindex = NULL; 2840 Ccusp->workVector = NULL; 2841 Cmat->cprowIndices = NULL; 2842 } 2843 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2844 Ccusp->mat = Cmat; 2845 Ccusp->mat->mat = Ccsr; 2846 Ccsr->num_rows = Ccusp->nrows; 2847 Ccsr->num_cols = n; 2848 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 2849 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2850 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2851 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2852 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 2853 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 2854 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2855 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2856 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2857 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2858 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2859 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2860 c->nz = 0; 2861 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2862 Ccsr->values = new THRUSTARRAY(c->nz); 2863 goto finalizesym; 2864 } 2865 2866 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2867 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2868 Acsr = (CsrMatrix *)Amat->mat; 2869 if (!biscompressed) { 2870 Bcsr = (CsrMatrix *)Bmat->mat; 2871 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2872 BmatSpDescr = Bmat->matDescr; 2873 #endif 2874 } else { /* we need to use row offsets for the full matrix */ 2875 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2876 Bcsr = new CsrMatrix; 2877 Bcsr->num_rows = B->rmap->n; 2878 Bcsr->num_cols = cBcsr->num_cols; 2879 Bcsr->num_entries = cBcsr->num_entries; 2880 Bcsr->column_indices = cBcsr->column_indices; 2881 Bcsr->values = cBcsr->values; 2882 if (!Bcusp->rowoffsets_gpu) { 2883 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2884 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 2885 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2886 } 2887 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2888 mmdata->Bcsr = Bcsr; 2889 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2890 if (Bcsr->num_rows && Bcsr->num_cols) { 2891 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2892 PetscCallCUSPARSE(stat); 2893 } 2894 BmatSpDescr = mmdata->matSpBDescr; 2895 #endif 2896 } 2897 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2898 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2899 /* precompute flops count */ 2900 if (ptype == MATPRODUCT_AB) { 2901 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2902 const PetscInt st = a->i[i]; 2903 const PetscInt en = a->i[i + 1]; 2904 for (j = st; j < en; j++) { 2905 const PetscInt brow = a->j[j]; 2906 flops += 2. * (b->i[brow + 1] - b->i[brow]); 2907 } 2908 } 2909 } else if (ptype == MATPRODUCT_AtB) { 2910 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2911 const PetscInt anzi = a->i[i + 1] - a->i[i]; 2912 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2913 flops += (2. * anzi) * bnzi; 2914 } 2915 } else { /* TODO */ 2916 flops = 0.; 2917 } 2918 2919 mmdata->flops = flops; 2920 PetscCall(PetscLogGpuTimeBegin()); 2921 2922 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2923 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2924 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2925 PetscCallCUSPARSE(stat); 2926 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2927 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2928 { 2929 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2930 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2931 */ 2932 void *dBuffer1 = NULL; 2933 void *dBuffer2 = NULL; 2934 void *dBuffer3 = NULL; 2935 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2936 size_t bufferSize1 = 0; 2937 size_t bufferSize2 = 0; 2938 size_t bufferSize3 = 0; 2939 size_t bufferSize4 = 0; 2940 size_t bufferSize5 = 0; 2941 2942 /*----------------------------------------------------------------------*/ 2943 /* ask bufferSize1 bytes for external memory */ 2944 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 2945 PetscCallCUSPARSE(stat); 2946 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 2947 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2948 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 2949 PetscCallCUSPARSE(stat); 2950 2951 /*----------------------------------------------------------------------*/ 2952 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 2953 PetscCallCUSPARSE(stat); 2954 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 2955 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 2956 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 2957 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 2958 PetscCallCUSPARSE(stat); 2959 
PetscCallCUDA(cudaFree(dBuffer1)); 2960 PetscCallCUDA(cudaFree(dBuffer2)); 2961 2962 /*----------------------------------------------------------------------*/ 2963 /* get matrix C non-zero entries C_nnz1 */ 2964 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2965 c->nz = (PetscInt)C_nnz1; 2966 /* allocate matrix C */ 2967 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2968 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2969 Ccsr->values = new THRUSTARRAY(c->nz); 2970 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2971 /* update matC with the new pointers */ 2972 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 2973 PetscCallCUSPARSE(stat); 2974 2975 /*----------------------------------------------------------------------*/ 2976 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 2977 PetscCallCUSPARSE(stat); 2978 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 2979 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 2980 PetscCallCUSPARSE(stat); 2981 PetscCallCUDA(cudaFree(dBuffer3)); 2982 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2983 PetscCallCUSPARSE(stat); 2984 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 
/ 1024)); 2985 } 2986 #else 2987 size_t bufSize2; 2988 /* ask bufferSize bytes for external memory */ 2989 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 2990 PetscCallCUSPARSE(stat); 2991 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2992 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2993 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 2994 PetscCallCUSPARSE(stat); 2995 /* ask bufferSize again bytes for external memory */ 2996 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 2997 PetscCallCUSPARSE(stat); 2998 /* The CUSPARSE documentation is not clear, nor the API 2999 We need both buffers to perform the operations properly! 3000 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3001 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3002 is stored in the descriptor! What a messy API... 
*/ 3003 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3004 /* compute the intermediate product of A * B */ 3005 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3006 PetscCallCUSPARSE(stat); 3007 /* get matrix C non-zero entries C_nnz1 */ 3008 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3009 c->nz = (PetscInt)C_nnz1; 3010 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3011 mmdata->mmBufferSize / 1024)); 3012 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3013 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3014 Ccsr->values = new THRUSTARRAY(c->nz); 3015 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3016 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3017 PetscCallCUSPARSE(stat); 3018 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3019 PetscCallCUSPARSE(stat); 3020 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3021 #else 3022 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3023 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3024 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3025 PetscCallCUSPARSE(stat); 3026 c->nz = cnz; 3027 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3028 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3029 Ccsr->values = new THRUSTARRAY(c->nz); 3030 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3031 3032 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3033 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3034 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3035 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3036 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3037 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3038 PetscCallCUSPARSE(stat); 3039 #endif 3040 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3041 PetscCall(PetscLogGpuTimeEnd()); 3042 finalizesym: 3043 c->singlemalloc = PETSC_FALSE; 3044 c->free_a = PETSC_TRUE; 3045 c->free_ij = PETSC_TRUE; 3046 PetscCall(PetscMalloc1(m + 1, &c->i)); 3047 PetscCall(PetscMalloc1(c->nz, &c->j)); 3048 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3049 PetscInt *d_i = c->i; 3050 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3051 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3052 ii = *Ccsr->row_offsets; 3053 jj = *Ccsr->column_indices; 3054 
if (ciscompressed) d_i = c->compressedrow.i; 3055 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3056 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3057 } else { 3058 PetscInt *d_i = c->i; 3059 if (ciscompressed) d_i = c->compressedrow.i; 3060 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3061 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3062 } 3063 if (ciscompressed) { /* need to expand host row offsets */ 3064 PetscInt r = 0; 3065 c->i[0] = 0; 3066 for (k = 0; k < c->compressedrow.nrows; k++) { 3067 const PetscInt next = c->compressedrow.rindex[k]; 3068 const PetscInt old = c->compressedrow.i[k]; 3069 for (; r < next; r++) c->i[r + 1] = old; 3070 } 3071 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3072 } 3073 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3074 PetscCall(PetscMalloc1(m, &c->ilen)); 3075 PetscCall(PetscMalloc1(m, &c->imax)); 3076 c->maxnz = c->nz; 3077 c->nonzerorowcnt = 0; 3078 c->rmax = 0; 3079 for (k = 0; k < m; k++) { 3080 const PetscInt nn = c->i[k + 1] - c->i[k]; 3081 c->ilen[k] = c->imax[k] = nn; 3082 c->nonzerorowcnt += (PetscInt) !!nn; 3083 c->rmax = PetscMax(c->rmax, nn); 3084 } 3085 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3086 PetscCall(PetscMalloc1(c->nz, &c->a)); 3087 Ccsr->num_entries = c->nz; 3088 3089 C->nonzerostate++; 3090 PetscCall(PetscLayoutSetUp(C->rmap)); 3091 PetscCall(PetscLayoutSetUp(C->cmap)); 3092 Ccusp->nonzerostate = C->nonzerostate; 3093 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3094 C->preallocated = PETSC_TRUE; 3095 C->assembled = PETSC_FALSE; 3096 C->was_assembled = PETSC_FALSE; 3097 if 
(product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B.
   Decides which productsymbolic implementation to install on mat based on the types/bind-to-cpu
   state of the product operands, after letting the user force the CPU backend through
   per-product options (-matmatmult_backend_cpu etc. for the old API, -mat_product_algorithm_backend_cpu
   for the MatProduct API). Falls back to the plain SeqAIJ dispatch when either operand lives on the CPU. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU path when neither operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default: break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A xx; thin wrapper over the shared mult/mult-add kernel below */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n); used to expand a compressed-row product into the full output vector.
   Launched 1-D with 256 threads/block; the i < n guard handles the ragged last block.
   NOTE(review): idx is assumed to contain no duplicate entries (each row index appears once), so no atomics are needed — confirm against cprowIndices construction. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (plain mult) or alias zz (in-place add); A may store compressed rows (zero rows dropped),
   in which case a work vector holds the short product which is then scattered into the full-length zz. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* logical lengths of x and y for y = op(A) x, taken from the CSR matrix itself */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: the product is zero (or just y) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
    else PetscCall(VecCopy_SeqCUDA(yy, zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use (building on demand) an explicitly stored transpose and a non-transpose op */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes a per-operation cache of cuSPARSE SpMV descriptors/buffers, hence the range check */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {           /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* CPU assembly plus invalidation of the flat device copy when the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  PetscObjectState    onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}

/* Frees the GPU-side storage (plain or triangular-factor), removes the composed methods, then
   destroys the host SeqAIJ part */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* duplicate on the host then convert the copy in place back to CUSPARSE */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}

/* Y += a*X entirely on the GPU: a cuBLAS axpy when the nonzero patterns match, cuSPARSE geam for a
   subset pattern, and the host SeqAIJ fallback otherwise (or when X is not a GPU matrix) */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* geam takes alpha/beta from the host here; restore device pointer mode afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}

/* Y *= a, applied directly to the device values via cuBLAS scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zeros the values on both device (CSR values of the matrix and, if present, of its stored transpose)
   and host, updating the offload mask accordingly */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device copy zeroed too, so CPU and GPU stay in sync */
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switches the operation table between the plain SeqAIJ (flg = PETSC_TRUE, data pulled back to the
   host) and the CUSPARSE implementations (flg = PETSC_FALSE), composing/uncomposing the matching
   plugin methods */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* reset the inner SeqAIJ op table so the default host array accessors are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* In-place (or copying) conversion of a SeqAIJ matrix to SEQAIJCUSPARSE: allocates the GPU side
   structures (cuSPARSE handle bound to PETSc's default stream, default algorithm choices), installs
   the CUSPARSE op table and composed methods */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* type constructor: build a SeqAIJ matrix then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

   Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

/* registers the CUSPARSE (and banded-CUSPARSE) solver backends with the MatSolverType registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}

static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE
*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  /* drop COO state built by either the basic or the extended COO preallocation path */
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees non-NULL jmap_d/perm_d,
       so leaving them dangling would lead to a double cudaFree */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Free the whole Mat_SeqAIJCUSPARSE context: both mult structs, work vectors, COO state and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) {
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (device value/index arrays plus the struct itself) and NULL the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) {
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor struct: descriptor, solve info, CSR storage and scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) {
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Free one mult struct; the storage held in ->mat depends on the storage format */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) {
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was missing PetscCall(): the returned error code was silently dropped */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV descriptor/buffer triple per (non-transpose, transpose, hermitian) operation */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release everything inside the trifactors struct but keep the struct (and its cusparse handle) alive */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) {
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}

/* Fully destroy the trifactors struct: reset its contents, then destroy the handle and free the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle; /* explicit assignment instead of assignment-inside-if */
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, col) ordering for thrust::sort_by_key on zipped (i,j) tuples */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (i,j) tuples, used by thrust::unique to collapse repeated COO entries */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ?
0 : 1; }
};

/* Logical-or functor used to merge the "row changed" and "column changed" indicator arrays */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   Scatter (and, for repeated COO entries, reduce) the user-provided values v[] into the
   device CSR value array, using the permutation cooPerm[] (and cooPerm_a[] when entries repeat)
   computed during preallocation. v may be a host or device pointer; NULL v with INSERT_VALUES zeros the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device copy of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: fall back to a plain assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add these them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose (used by MatMultTranspose etc.) as stale; optionally free it outright */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  /* if the number of COO entries changed, the cached permutations are useless */
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* make sure coo_i[] is accessible on the device; copy it up if it lives on the host */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i      = [1,1,3,3,4,4]
      d_j      = [2,2,2,3,5,6]
      cooPerm  = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i      = [1,3,3,4,4,x]
                            ^ekey
      d_j      = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the underlying SeqAIJ with ones matching the deduplicated COO pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: use the fast 'Basic' path when indices are on the device
   or contain no negative (i.e. ignored) entries; otherwise build the extended COO maps on
   the host via the SeqAIJ implementation and mirror them to the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices mean "drop this entry"; the Basic path cannot handle them */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* mirror the host-side jmap/perm maps built by MatSetPreallocationCOO_SeqAIJ to the device */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
PetscFunctionReturn(0);
}

/* Device kernel: accumulate user COO values kv[] into the CSR value array a[].
   For nonzero slot idx, jmap[idx]..jmap[idx+1] delimits the (possibly repeated) COO entries
   that map to it, and perm[] translates them back to positions in kv[].
   Launched over a 1-D grid; the outer loop makes any launch configuration valid. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  PetscCount       idx    = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount stride = gridDim.x * blockDim.x;
  for (; idx < nnz; idx += stride) {
    PetscScalar contrib = 0.0;
    for (PetscCount p = jmap[idx]; p < jmap[idx + 1]; p++) contrib += kv[perm[p]];
    if (imode == INSERT_VALUES) a[idx] = contrib;
    else a[idx] += contrib;
  }
}

/* Set (or add) matrix values from a COO array previously described by MatSetPreallocationCOO_SeqAIJCUSPARSE().
   Dispatches to the Basic implementation unless the extended COO maps are in use;
   host-resident v[] is staged through a temporary device buffer. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          nnz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *d_v = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
    PetscFunctionReturn(0);
  }

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&d_v, seq->coo_n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)d_v, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only: no host->device copy */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  if (nnz) {
    MatAddCOOValues<<<(nnz + 255) / 256, 256>>>(d_v, nnz, dev->jmap_d, dev->perm_d, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)d_v));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ -
returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  i - the CSR row pointers
-  j - the CSR column indices

   Level: developer

   Note:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily build (and cache) the uncompressed row offsets on the device from the host a->i */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  i - the CSR row pointers
-  j - the CSR column indices

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the pointers were borrowed from internal storage; just invalidate the caller's copies */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state increase, no transpose invalidation needed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* caller may write through the pointer: device copy becomes authoritative and the cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: drop cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   Does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only: unlike GetArray(), no MatSeqAIJCUSPARSECopyToGPU() here — current values are about to be overwritten */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the
write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 4545 4546 Not Collective 4547 4548 Input Parameter: 4549 . A - a `MATSEQAIJCUSPARSE` matrix 4550 4551 Output Parameter: 4552 . a - pointer to the device data 4553 4554 Level: developer 4555 4556 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 4557 @*/ 4558 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) { 4559 PetscFunctionBegin; 4560 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4561 PetscValidPointer(a, 2); 4562 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4563 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4564 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4565 *a = NULL; 4566 PetscFunctionReturn(0); 4567 } 4568 4569 struct IJCompare4 { 4570 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) { 4571 if (t1.get<0>() < t2.get<0>()) return true; 4572 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4573 return false; 4574 } 4575 }; 4576 4577 struct Shift { 4578 int _shift; 4579 4580 Shift(int shift) : _shift(shift) { } 4581 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4582 }; 4583 4584 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */
/*
  MatSeqAIJCUSPARSEMergeMats - device-side merge of two MATSEQAIJCUSPARSE matrices by
  column concatenation: C = [A, B] (i.e. [A';B']' in MATLAB notation).

  Input Parameters:
+ A     - first MATSEQAIJCUSPARSE matrix (m x nA)
. B     - second MATSEQAIJCUSPARSE matrix (m x nB, same number of rows as A)
- reuse - MAT_INITIAL_MATRIX to create C, MAT_REUSE_MATRIX to refresh only its numerical
          values; MAT_INPLACE_MATRIX is not supported

  Output Parameter:
. C - the m x (nA+nB) merged matrix

  Notes:
  With MAT_INITIAL_MATRIX the CSR structure of C is built on the GPU: A and B are expanded
  to COO, the two entry streams (B's columns shifted by nA) are merged sorted by (row, col),
  and the result is converted back to CSR. The permutation that interleaves A's and B's
  entries is cached in Ccusp->cooPerm so that MAT_REUSE_MATRIX only needs to scatter the
  current values of A and B into C (and refresh C's explicit transpose when present).
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    /* build C's device-side multiply structure by hand; the host (SeqAIJ) arrays are filled below */
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* shift B's column indices by nA on the fly so they land in C's right-hand block */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      /* merge the (row, col, val, origin-flag) streams of A (flag 1) and B (flag 0) ordered by (row, col) */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); /* undo the in-place shift of B's columns */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged positions into cooPerm: first Annz entries locate A's values in C, the rest B's */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* C' = [A'; B'] stacked vertically: B's transposed row offsets are shifted by nnz(A) */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* A's closing offset equals B's first shifted offset; overwrite it */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device CSR structure to the host arrays expected by the base SeqAIJ class */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* derive per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      /* sanity checks: the nonzero structures must still agree with the cached permutation */
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through the first Annz entries of cooPerm ... */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      /* ... and B's values through the remaining entries */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* C' values are just A' values followed by B' values (see the MAT_INITIAL_MATRIX path) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Copy selected values of A into v[]: with idx, v[k] = Aval[idx[k]] for k < n (gathered on
   the device); without idx, the first n values are copied. v may point to either host or
   device memory (detected with isCudaMem()). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else { /* host output: gather into a device scratch buffer, then copy it back below */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved device -> host, so log it as GpuToCpu
     (previously logged with PetscLogCpuToGpu, i.e. the direction reversed) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}