/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
  #include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* String table for MatCUSPARSEStorageFormat: the value names in 0-based enum order, followed by the
   enum type name, the option prefix, and a terminating 0, as required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
/* Note: entry order must match the cuSPARSE integer values above; the PetscCheck()s in
   MatSetFromOptions_SeqAIJCUSPARSE() guard against cuSPARSE renumbering these enums */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* Forward declarations of the factorization, solve, multiply, and option-handling routines
   implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
/* NOTE(review): this overload takes a *TriFactorStruct* despite the MultStruct name; it is
   overloaded with the MultStruct variant declared just below (C++ overloading) — confirm intentional */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Type-specific implementation behind MatCUSPARSESetFormat(); records the requested GPU storage
   format in the Mat_SeqAIJCUSPARSE struct. Only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are accepted
   (both set the single format field of a sequential matrix); anything else errors out. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a sequential matrix has one format field for both ops */
  case MAT_CUSPARSE_ALL:
    spstruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
          `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation registered under "MatCUSPARSESetFormat_C"
     (no-op for matrix types that did not register one) */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(); simply stores the flag that
   MatLUFactorNumeric_SeqAIJCUSPARSE() later consults when choosing solve routines. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  spstruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method to specify if the solve is done on the CPU or GPU (GPU is the default).

  Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation (no-op for other matrix types) */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSetOption override for SEQAIJCUSPARSE: intercepts MAT_FORM_EXPLICIT_TRANSPOSE (which needs
   GPU-side bookkeeping) and forwards every other option to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ);
   this wrapper then selects GPU or CPU solve routines and, for the GPU path, pushes the triangular
   factors to the device.
   B - the factor matrix; A - the matrix being factored; info - factorization options */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before the CPU factorization */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* identity orderings: skip the permutation work in the solve */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* NOTE(review): when use_cpu_solve is true the solve ops set by the CPU factorization are kept as-is */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Processes -mat_cusparse_* command-line options: storage format (CSR/ELL/HYB), CPU-vs-GPU solve,
   and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. Only applies to unfactored
   matrices (A->factortype == MAT_FACTOR_NONE). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
/* Builds (or updates the values of) the unit-lower-triangular factor L of an (I)LU factorization
   as a cuSPARSE CSR matrix on the GPU, including the csrsv solve analysis.
   Assumes A holds a CPU-factored SeqAIJ matrix; only runs when the device copy is stale
   (offloadmask UNALLOCATED or CPU). Presumably a->i/a->j/a->a store the strictly-lower part of
   each row in the factored layout — TODO confirm against MatLUFactorNumeric_SeqAIJ. */
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first call: build the whole CSR structure */
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assign()s below copy at full bandwidth */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal; row i appends the
           strictly-lower entries of A followed by the unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query then allocate the scratch buffer csrsv analysis/solve needs */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer (AA_h) for the update-values path; index buffers can go */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Builds (or updates the values of) the upper-triangular factor U of an (I)LU factorization
   as a cuSPARSE CSR matrix on the GPU; mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above
   but stores 1/diagonal (NON_UNIT diagonal) and walks rows backwards via a->diag. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors =
(Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* in the factored layout adiag[] decreases, so adiag[0]-adiag[n] counts the U entries */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first call: build the whole CSR structure */
        PetscScalar *AAUp;

        /* pinned host buffers for fast host->device copies */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from the bottom up and filling the
           arrays back-to-front; the diagonal is stored inverted (1/d) so the solve can multiply */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query then allocate the scratch buffer csrsv analysis/solve needs */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer (AA_h) for the update-values path; index buffers can go */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only; the sparsity pattern is unchanged */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Pushes both ILU triangular factors of A to the GPU (building them on first use), allocates the
   work vector used by the triangular solves, and caches row/column permutation indices on the
   device when the orderings are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Builds both triangular factors of an ICC (Cholesky) factorization on the GPU from the
   SBAIJ-format factor. NOTE(review): A->data is cast both to Mat_SeqAIJ and Mat_SeqSBAIJ below —
   presumably the ICC factor is stored in SBAIJ layout; confirm against the ICC factorization code.
   (Function continues beyond this excerpt.) */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the U and (transposed) L values */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 613 for (j = offset; j < offset + nz; j++) { 614 AAUp[j] = -AAUp[j]; 615 AALo[j] = AAUp[j] / v[nz]; 616 } 617 offset += nz; 618 } 619 } 620 621 /* allocate space for the triangular factor information */ 622 PetscCall(PetscNew(&upTriFactor)); 623 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 624 625 /* Create the matrix description */ 626 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 627 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 628 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 629 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 630 #else 631 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 632 #endif 633 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 634 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 635 636 /* set the matrix */ 637 upTriFactor->csrMat = new CsrMatrix; 638 upTriFactor->csrMat->num_rows = A->rmap->n; 639 upTriFactor->csrMat->num_cols = A->cmap->n; 640 upTriFactor->csrMat->num_entries = a->nz; 641 642 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 643 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 644 645 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 646 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 647 648 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 649 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 650 651 /* set the operation */ 652 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 653 654 /* Create the solve analysis information */ 655 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 656 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 657 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 658 
PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 659 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 660 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 661 #endif 662 663 /* perform the solve analysis */ 664 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 665 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 666 667 PetscCallCUDA(WaitForCUDA()); 668 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 669 670 /* assign the pointer */ 671 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 672 673 /* allocate space for the triangular factor information */ 674 PetscCall(PetscNew(&loTriFactor)); 675 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 676 677 /* Create the matrix description */ 678 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 679 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 680 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 681 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 682 #else 683 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 684 #endif 685 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 686 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 687 688 /* set the operation */ 689
/* NOTE(review): FILL_MODE_UPPER for the "lower" ICC factor is intentional, not a typo: ICC stores only the upper factor U (arrays AiUp/AjUp below are reused), and the lower solve is performed as U^T via solveOp = CUSPARSE_OPERATION_TRANSPOSE on the next line */
loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 690 691 /* set the matrix */ 692 loTriFactor->csrMat = new CsrMatrix; 693 loTriFactor->csrMat->num_rows = A->rmap->n; 694 loTriFactor->csrMat->num_cols = A->cmap->n; 695 loTriFactor->csrMat->num_entries = a->nz; 696 697 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 698 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 699 700 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 701 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 702 703 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 704 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 705 706 /* Create the solve analysis information */ 707 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 708 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 709 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 710 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 711 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 712 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 713 #endif 714 715 /* perform the solve analysis */ 716 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 717 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 718 719 PetscCallCUDA(WaitForCUDA()); 720 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 721 722 /* assign the pointer
   (this TRANSPOSE-solved upper-pattern factor stands in for L in MatSolve)
*/ 723 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 724 725 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 726 PetscCallCUDA(cudaFreeHost(AiUp)); 727 PetscCallCUDA(cudaFreeHost(AjUp)); 728 } else { 729 /* Fill the upper triangular matrix */ 730 offset = 0; 731 for (i = 0; i < n; i++) { 732 /* set the pointers */ 733 v = aa + ai[i]; 734 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 735 736 /* first, set the diagonal elements */ 737 AAUp[offset] = 1.0 / v[nz]; 738 AALo[offset] = 1.0 / v[nz]; 739 740 offset += 1; 741 if (nz > 0) { 742 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 743 for (j = offset; j < offset + nz; j++) { 744 AAUp[j] = -AAUp[j]; 745 AALo[j] = AAUp[j] / v[nz]; 746 } 747 offset += nz; 748 } 749 } 750 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 751 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 752 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 753 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 754 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 755 } 756 PetscCallCUDA(cudaFreeHost(AAUp)); 757 PetscCallCUDA(cudaFreeHost(AALo)); 758 } catch (char *ex) { 759 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 760 } 761 } 762 PetscFunctionReturn(PETSC_SUCCESS); 763 } 764 765 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 766 { 767 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 768 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 769 IS ip = a->row; 770 PetscBool perm_identity; 771 PetscInt n = A->rmap->n; 772 773 PetscFunctionBegin; 774 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 775 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 776 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new
/* scratch vector with one entry per local row, reused as the intermediate between the two triangular solves; nnz below counts both strictly triangular halves plus the diagonal once */
THRUSTARRAY(n); 777 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 778 779 A->offloadmask = PETSC_OFFLOAD_BOTH; 780 781 /* lower triangular indices */ 782 PetscCall(ISIdentity(ip, &perm_identity)); 783 if (!perm_identity) { 784 IS iip; 785 const PetscInt *irip, *rip; 786 787 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 788 PetscCall(ISGetIndices(iip, &irip)); 789 PetscCall(ISGetIndices(ip, &rip)); 790 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 791 cusparseTriFactors->rpermIndices->assign(rip, rip + n); 792 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 793 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 794 PetscCall(ISRestoreIndices(iip, &irip)); 795 PetscCall(ISDestroy(&iip)); 796 PetscCall(ISRestoreIndices(ip, &rip)); 797 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 798 } 799 PetscFunctionReturn(PETSC_SUCCESS); 800 } 801 802 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 803 { 804 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 805 IS ip = b->row; 806 PetscBool perm_identity; 807 808 PetscFunctionBegin; 809 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 810 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 811 B->offloadmask = PETSC_OFFLOAD_CPU; 812 /* determine which version of MatSolve needs to be used.
*/ 813 PetscCall(ISIdentity(ip, &perm_identity)); 814 if (perm_identity) { 815 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 816 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 817 B->ops->matsolve = NULL; 818 B->ops->matsolvetranspose = NULL; 819 } else { 820 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 821 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 822 B->ops->matsolve = NULL; 823 B->ops->matsolvetranspose = NULL; 824 } 825 826 /* get the triangular factors */ 827 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 828 PetscFunctionReturn(PETSC_SUCCESS); 829 } 830 831 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 832 { 833 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 834 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 835 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 836 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 837 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 838 cusparseIndexBase_t indexBase; 839 cusparseMatrixType_t matrixType; 840 cusparseFillMode_t fillMode; 841 cusparseDiagType_t diagType; 842 843 PetscFunctionBegin; 844 /* allocate space for the transpose of the lower triangular factor */ 845 PetscCall(PetscNew(&loTriFactorT)); 846 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 847 848 /* set the matrix descriptors of the lower triangular factor */ 849 matrixType = cusparseGetMatType(loTriFactor->descr); 850 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 851 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 852 diagType = cusparseGetMatDiagType(loTriFactor->descr); 853 854 /* Create the matrix description */ 855 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 856 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 857 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 858 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 859 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 860 861 /* set the operation */ 862 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 863 864 /* allocate GPU space for the CSC of the lower triangular factor*/ 865 loTriFactorT->csrMat = new CsrMatrix; 866 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 867 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 868 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 869 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 870 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 871 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 872 873 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 874 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 875 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 876 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 877 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 878 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 879 #endif 880 881 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 882 { 883 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 884 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 885 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 886 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 887 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 888 #else 889 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 890 #endif 891 PetscCallCUSPARSE(stat); 892 } 893 894 PetscCallCUDA(WaitForCUDA()); 895 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 896 897 /* Create the solve analysis information */ 898 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 899 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 900 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 901 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 902 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 903 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 904 #endif 905 906 /* perform the solve analysis */ 907 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 908 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 909 910 PetscCallCUDA(WaitForCUDA()); 911 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 912 913 /* assign the pointer */ 914 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 915 916 /*********************************************/ 917 /* Now the Transpose of the Upper Tri Factor */ 918 /*********************************************/ 919 920 /* allocate space for the transpose of the upper triangular factor */ 921 PetscCall(PetscNew(&upTriFactorT)); 922 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 923 924 /* set the matrix descriptors of the upper triangular factor */ 925 matrixType = cusparseGetMatType(upTriFactor->descr); 926 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 927 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 928 diagType = cusparseGetMatDiagType(upTriFactor->descr); 929 930 /* Create the matrix description */ 931 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 932 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 933 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 934 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 935 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 936 937 /* set the operation */ 938 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 939 940 /* allocate GPU space for the CSC of the upper triangular factor*/ 941 upTriFactorT->csrMat = new CsrMatrix; 942 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 943 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 944 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 945 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 946 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 947 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 948 949 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 950 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 951 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 952 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 953 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 954 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 955 #endif 956 957 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 958 { 959 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 960 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 961 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 962 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 963 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 964 #else 965 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 966 #endif 967 PetscCallCUSPARSE(stat); 968 } 969 970 PetscCallCUDA(WaitForCUDA()); 971 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 972 973 /* Create the solve analysis information */ 974 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 975 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 976 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 977 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 978 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 979 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 980 #endif 981 982 /* perform the solve analysis */ 983 /* christ, would it have killed you to put this stuff in a function????????? */ 984 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 985 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 986 987 PetscCallCUDA(WaitForCUDA()); 988 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 989 990 /* assign the pointer */ 991 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 992 PetscFunctionReturn(PETSC_SUCCESS); 993 } 994 995 struct PetscScalarToPetscInt { 996 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 997 }; 998 999 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1000 { 1001 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1002 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1003 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1004 cusparseStatus_t stat; 1005 cusparseIndexBase_t indexBase; 1006 1007 PetscFunctionBegin; 1008 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1009 matstruct = 
(Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1010 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1011 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1012 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1013 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 1014 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1015 PetscCall(PetscLogGpuTimeBegin()); 1016 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1017 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1018 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1019 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1020 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1021 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1022 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1023 1024 /* set alpha and beta */ 1025 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 1026 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 1027 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1028 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1029 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1030 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1031 1032 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1033 CsrMatrix *matrixT = new CsrMatrix; 1034 matstructT->mat = matrixT; 1035 matrixT->num_rows = A->cmap->n; 1036 matrixT->num_cols = A->rmap->n; 1037 matrixT->num_entries = a->nz; 1038 matrixT->row_offsets = new
/* the transpose swaps the row/column dimensions; the nonzero count is unchanged */
THRUSTINTARRAY32(matrixT->num_rows + 1); 1039 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1040 matrixT->values = new THRUSTARRAY(a->nz); 1041 1042 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1043 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1044 1045 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1046 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1047 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1048 indexBase, cusparse_scalartype); 1049 PetscCallCUSPARSE(stat); 1050 #else 1051 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1052 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1053 1054 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1055 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1056 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
   (empty-matrix workaround, only for cusparse older than 11.2.1)
1057 */ 1058 if (matrixT->num_entries) { 1059 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1060 PetscCallCUSPARSE(stat); 1061 1062 } else { 1063 matstructT->matDescr = NULL; 1064 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1065 } 1066 #endif 1067 #endif 1068 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1069 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1070 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1071 #else 1072 CsrMatrix *temp = new CsrMatrix; 1073 CsrMatrix *tempT = new CsrMatrix; 1074 /* First convert HYB to CSR */ 1075 temp->num_rows = A->rmap->n; 1076 temp->num_cols = A->cmap->n; 1077 temp->num_entries = a->nz; 1078 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1079 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1080 temp->values = new THRUSTARRAY(a->nz); 1081 1082 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1083 PetscCallCUSPARSE(stat); 1084 1085 /* Next, convert CSR to CSC (i.e.
   (legacy HYB/ELL path, pre-CUDA-11 only)
the matrix transpose) */ 1086 tempT->num_rows = A->rmap->n; 1087 tempT->num_cols = A->cmap->n; 1088 tempT->num_entries = a->nz; 1089 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1090 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1091 tempT->values = new THRUSTARRAY(a->nz); 1092 1093 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1094 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1095 PetscCallCUSPARSE(stat); 1096 1097 /* Last, convert CSC to HYB */ 1098 cusparseHybMat_t hybMat; 1099 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1100 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1101 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1102 PetscCallCUSPARSE(stat); 1103 1104 /* assign the pointer */ 1105 matstructT->mat = hybMat; 1106 A->transupdated = PETSC_TRUE; 1107 /* delete temporaries */ 1108 if (tempT) { 1109 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1110 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1111 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1112 delete (CsrMatrix *)tempT; 1113 } 1114 if (temp) { 1115 if (temp->values) delete (THRUSTARRAY *)temp->values; 1116 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1117 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1118 delete (CsrMatrix *)temp; 1119 } 1120 #endif 1121 } 1122 } 1123 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present,
   so only the numerical values need refreshing
update data */ 1124 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1125 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1126 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1127 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1128 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1129 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1130 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1131 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1132 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1133 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1134 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1135 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1136 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1137 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1138 } 1139 if (!cusparsestruct->csr2csc_i) { 1140 THRUSTARRAY csr2csc_a(matrix->num_entries); 1141 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1142 1143 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1144 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1145 void *csr2cscBuffer; 1146 size_t csr2cscBufferSize; 1147 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1148 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg,
/* one csr2csc on a 0,1,2,... sequence recovers the CSR-to-CSC value permutation csr2csc_i, which later value-only updates reuse via a cheap thrust gather instead of rerunning csr2csc */
&csr2cscBufferSize); 1149 PetscCallCUSPARSE(stat); 1150 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1151 #endif 1152 1153 if (matrix->num_entries) { 1154 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1155 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1156 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1157 1158 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1159 should be filled with indexBase. So I just take a shortcut here. 1160 */ 1161 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1162 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1163 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1164 PetscCallCUSPARSE(stat); 1165 #else 1166 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1167 PetscCallCUSPARSE(stat); 1168 #endif 1169 } else { 1170 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1171 } 1172 1173 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1174 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1175 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1176 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1177 #endif 1178 } 1179 PetscCallThrust( 1180 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(),
/* gather step: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1181 } 1182 PetscCall(PetscLogGpuTimeEnd()); 1183 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1184 /* the compressed row indices is not used for matTranspose */ 1185 matstructT->cprowIndices = NULL; 1186 /* assign the pointer */ 1187 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1188 A->transupdated = PETSC_TRUE; 1189 PetscFunctionReturn(PETSC_SUCCESS); 1190 } 1191 1192 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1193 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1194 { 1195 PetscInt n = xx->map->n; 1196 const PetscScalar *barray; 1197 PetscScalar *xarray; 1198 thrust::device_ptr<const PetscScalar> bGPU; 1199 thrust::device_ptr<PetscScalar> xGPU; 1200 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1201 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1202 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1203 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1204 1205 PetscFunctionBegin; 1206 /* Analyze the matrix and create the transpose ...
on the fly */ 1207 if (!loTriFactorT && !upTriFactorT) { 1208 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1209 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1210 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1211 } 1212 1213 /* Get the GPU pointers */ 1214 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1215 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1216 xGPU = thrust::device_pointer_cast(xarray); 1217 bGPU = thrust::device_pointer_cast(barray); 1218 1219 PetscCall(PetscLogGpuTimeBegin()); 1220 /* First, reorder with the row permutation */ 1221 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1222 1223 /* First, solve U */ 1224 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1225 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1226 1227 /* Then, solve L */ 1228 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1229 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1230 1231 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... 
can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary back over the full solution vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* Restore the CUDA arrays, then log GPU time and an estimate of the solve flops */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Transposed triangular solve when the factorization used the natural ordering:
   same as MatSolveTranspose_SeqAIJCUSPARSE above, but no row/column permutations
   are applied to b or x, so the solves read/write the vector arrays directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly (only on the first transpose solve; the structures are cached afterwards) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U (b -> temp) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L (temp -> x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Forward triangular solve x = U^{-1} L^{-1} P_r b (with column permutation applied at the end) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector.
     (The element iterator of the "end" permutation iterator is unused by thrust;
     the copy length is determined by the rpermIndices begin/end index range.) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L (temp -> x) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U (x -> temp) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Forward triangular solve when the factorization used the natural ordering:
   no permutations needed, so b feeds the L solve directly and U writes x directly. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L (b -> temp) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U (temp -> x) */
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */

/* Forward solve with an ILU(0) factorization held in the generic SpSV data structures:
   L y = b, then U x = y. The SpSV descriptors/analysis were set up in the factorization. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!

  /* Solve U*x = y; reuse descriptor X, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Transposed solve with an ILU(0) factorization: U^T y = b, then L^T x = y.
   Transpose SpSV descriptors/buffers are created and analyzed lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L.
We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the numeric analysis whenever the factor values have changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y; reuse descriptor X, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric ILU(0) factorization: copy A's values into fact's in-place CSR storage,
   run csrilu02, then redo the (numeric) SpSV analysis for the L/U solves. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status =
cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ILU(0) setup: since ILU(0) keeps A's sparsity pattern, this mostly allocates
   device storage mirroring A's CSR structure and creates/sizes the cuSPARSE descriptors;
   the unused IS arguments are the row/column permutations of the generic interface. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.
*/
  /* We'll do in-place factorization on fact */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L of an ILU factorization has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* L and U share the same in-place CSR arrays; only fill mode/diag type differ */
  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
     */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i]; /* NOTE(review): this value is immediately overwritten below (dead store) — confirm which estimate of nzLeft is intended */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Forward solve with an ICC(0) factorization: L y = b, then L^T x = y.
   Also used for the transpose solve since the factorization is symmetric. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y; reuse descriptor X, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric ICC(0) factorization: copy A's values into fact's in-place CSR storage,
   run csric02 (which reads only the lower triangular part), then redo the numeric
   SpSV analysis for the L and Lt solves. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric (needs valid values), so it must follow cusparseXcsric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* symmetric factorization: transpose solve is the same */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - symbolic phase for a device ICC(0) factorization of A into fact.

   Copies A's CSR pattern (uncompressed row pointers) to device arrays owned by fact, creates the cuSPARSE
   descriptors for M (the full matrix handed to csric02) and L (the lower-triangular view used by the SpSV
   solves with L and L^T), sizes and allocates the csric02/SpSV work buffers, runs the csric02 analysis phase,
   and estimates the FLOPs the numeric factorization will cost. The IS argument (fill-reducing ordering) is
   intentionally unnamed: this path is only selected for an identity permutation (see the caller below).

   Note: values are factored in place in fs->csrVal later, by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0,
   which this routine installs as fact->ops->choleskyfactornumeric. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    /* ICC(0) on device requires a square MATSEQAIJCUSPARSE matrix with a structurally present diagonal */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  /* device-to-device copies of the pattern; asynchronous on PETSc's default stream */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* legacy (csric02) descriptor for M: the whole matrix, general type, 0-based */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* generic-API descriptor for L: same storage as M, but SpSV is told to use only the
     lower triangle with a non-unit diagonal (L from A = L*L^T) */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  /* guarded by m: cusparse errors on empty matrices */
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* X, Y are scratch dense vectors used by the SpSV solve phases */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  /* forward solve with L ... */
  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  /* ... and backward solve with L^T, both on the same descriptor for L */
  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  /* the shared buffer is sized to the max of the factorization and the larger SpSV request */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* assumes a roughly symmetric split of off-diagonals around the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* MatILUFactorSymbolic_SeqAIJCUSPARSE - dispatcher for ILU symbolic factorization.
   Takes the fast device ILU(0) path only when no fill levels were requested, both row
   and column permutations are identity, and device factorization was selected;
   otherwise falls back to the host symbolic routine with the CUSPARSE numeric kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    /* ILU(0) with natural ordering: fully on-device path */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* host symbolic phase; numeric phase still runs the CUSPARSE kernel */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatLUFactorSymbolic_SeqAIJCUSPARSE - full LU symbolic factorization: always uses the
   host symbolic routine, then installs the CUSPARSE numeric kernel. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatICCFactorSymbolic_SeqAIJCUSPARSE - dispatcher for ICC symbolic factorization.
   Mirrors the ILU dispatcher above: device ICC(0) path only for level 0, identity
   permutation, and device factorization; otherwise host symbolic + CUSPARSE numeric. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - full Cholesky symbolic factorization:
   always host symbolic routine, CUSPARSE numeric kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode
MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  /* Query routine composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* MatGetFactor_seqaijcusparse_cusparse - factory routine for MATSOLVERCUSPARSE factor matrices.

   Creates an empty MATSEQAIJCUSPARSE factor matrix B of the same (square) size as A, reads the
   -mat_factor_bind_factorization option ("host" or "device") to decide where the factorization
   will run, and installs the symbolic-factorization function pointers appropriate for the
   requested factor type (LU/ILU/ILUDT or Cholesky/ICC) and for whether A is bound to the CPU. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* prefer the factor matrix's own options prefix; fall back to A's */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A lives on the CPU: use the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* nested dissection for full LU; natural ordering for the incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJCUSPARSECopyFromGPU - bring the matrix values (not the pattern) from device to host
   when the host copy is stale (offloadmask == PETSC_OFFLOAD_GPU). Handles both unfactored
   matrices (values live in the CsrMatrix of the mult struct) and, with new-enough cusparse,
   factored matrices whose values live in the tri-factors' csrVal. On success both copies are
   marked valid (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read/write access to the host value array: sync values down from the GPU first.
   Matching restore marks the host copy as the only valid one. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* caller may have modified host values: invalidate the device copy */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access: sync down from GPU, but the restore does NOT invalidate the device copy */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access: no device-to-host copy is needed since existing values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* new values were written on the host: device copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - expose the device CSR arrays (i, j, a) of an
   unfactored matrix, reporting PETSC_MEMTYPE_CUDA. Each output pointer is optional (may be NULL). */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    /* device CSR indices are 32-bit ints, so PetscInt* aliasing is only valid with 32-bit indices */
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJCUSPARSECopyToGPU - mirror the host SeqAIJ data onto the device when the device
   copy is missing or stale.

   Two paths:
     - same nonzero pattern (nonzerostate unchanged) and CSR storage: only re-upload the values;
     - otherwise: destroy and rebuild the whole device mult structure (descriptors, scalar
       constants, CSR/HYB storage, compressed-row index list).

   If the host has no value array (a->a == NULL), only the pattern is uploaded and the matrix is
   NOT marked PETSC_OFFLOAD_BOTH (both = PETSC_FALSE). Errors if A is bound to the CPU. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose values are stale (pattern is still good) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* pattern changed: tear down everything device-side and rebuild */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed-row storage only the nonempty rows are represented on device */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* pattern-only upload: remember the host has no values yet */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1), used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: stage a CSR copy, convert it to HYB, then free the staging CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Thrust functor: accumulate, dest (tuple slot 1) += src (tuple slot 0) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: copy, dest (tuple slot 1) = src (tuple slot 0) */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: copy in the opposite direction, slot 0 = slot 1 */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product workspace kept in C->product->data for AIJCUSPARSE mat-mat products.
   Holds the intermediate dense matrix X (for PtAP/RARt), scratch buffers, and the cached
   cusparse descriptors needed to reuse SpMM/SpGEMM state across numeric phases. */
struct MatMatCusparse {
  PetscBool      cisdense; /* whether C should be converted back to plain MATSEQDENSE after the product */
  PetscScalar   *Bt;       /* device buffer for an explicit transpose of B (pre-CUDA-11 path) */
  Mat            X;        /* intermediate A*P (resp. A*R^T) result for PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse: frees device buffers, cusparse descriptors, and the
   intermediate matrix X. NULL members are safe (cudaFree(NULL) is a no-op; descriptors
   are destroyed only when set). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);

/* MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of sparse (AIJCUSPARSE) times
   dense (DENSECUDA) products: AB, AtB, ABt, PtAP, RARt.

   Computes op(A)*op(B) with cusparseSpMM (CUDA >= 11) or cusparse_csr_spmm (older CUDA, with an
   explicit cuBLAS transpose of B for the ABt/RARt cases since csrmm cannot transpose B). For
   PtAP/RARt the sparse-dense product goes into the intermediate mmdata->X and a dense-dense
   product with B finishes the job. Cached dense-matrix descriptors and the SpMM work buffer are
   (re)created when leading dimensions change. A dense B passed on the host is converted to
   MATSEQDENSECUDA in place and converted back at the end. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* pick op(A) and the result dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes A*P into X */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cusparse transpose A on the fly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the stored explicit transpose of A */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes A*R^T into X */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* sparse-dense result goes into the intermediate X, not C */
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow (never shrink) the SpMM work buffer */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into mmdata->Bt with cuBLAS geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    /* finish RARt: C = R * X, with X = A*R^T */
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    /* finish PtAP: C = P^T * X, with X = A*P */
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo the in-place conversions done at entry, if any */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
case MATPRODUCT_AtB: 2524 m = A->cmap->n; 2525 n = B->cmap->n; 2526 break; 2527 case MATPRODUCT_ABt: 2528 m = A->rmap->n; 2529 n = B->rmap->n; 2530 break; 2531 case MATPRODUCT_PtAP: 2532 m = B->cmap->n; 2533 n = B->cmap->n; 2534 break; 2535 case MATPRODUCT_RARt: 2536 m = B->rmap->n; 2537 n = B->rmap->n; 2538 break; 2539 default: 2540 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2541 } 2542 PetscCall(MatSetSizes(C, m, n, m, n)); 2543 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2544 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2545 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2546 2547 /* product data */ 2548 PetscCall(PetscNew(&mmdata)); 2549 mmdata->cisdense = cisdense; 2550 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2551 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2552 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2553 #endif 2554 /* for these products we need intermediate storage */ 2555 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2556 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2557 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2558 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2559 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2560 } else { 2561 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2562 } 2563 } 2564 C->product->data = mmdata; 2565 C->product->destroy = MatDestroy_MatMatCusparse; 2566 2567 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2568 
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric phase of sparse-sparse products C = op(A)*op(B) for SeqAIJCUSPARSE
   matrices, using cuSPARSE SpGEMM (spgemm-reuse on CUDA >= 11.4, SpGEMM on
   11.0-11.3, csrgemm before 11.0). The symbolic phase has already allocated
   C's structure and the SpGEMM descriptors stored in the MatMatCusparse data. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: only run the assembly bookkeeping below */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* when a factor is symmetric, the symbolic phase may have rewritten the product
     type as plain AB (so the explicit transpose is not needed); mirror that here */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick plain or (explicitly formed) transposed mult structs per product type */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* reuse path: structure was computed in symbolic, only values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of sparse-sparse products for SeqAIJCUSPARSE matrices:
   computes the nonzero structure of C on the GPU (via cusparseSpGEMMreuse /
   cusparseSpGEMM / cusparseXcsrgemm depending on the CUDA version), allocates
   C's CSR arrays, and mirrors the structure back to the host. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* if a factor is symmetric we can avoid the explicit transpose and compute a plain AB product;
     record that fact so the numeric phase stays consistent */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick dimensions and mult structs; form explicit transposes where the product needs them */
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C's descriptor is created with nnz = 0 and NULL arrays; SpGEMM fills the sizes and
     cusparseCsrSetPointers attaches the actual storage once nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
  #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the GPU-computed structure into the host-side Mat_SeqAIJ so the matrix
     behaves like a regular assembled SeqAIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and row-statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* per-product-type option names: api_user means the user called e.g. MatMatMult()
       directly, so the options are advertised under the legacy names */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3133 PetscOptionsEnd(); 3134 } else { 3135 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3136 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3137 PetscOptionsEnd(); 3138 } 3139 break; 3140 default: 3141 break; 3142 } 3143 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3144 } 3145 /* dispatch */ 3146 if (isdense) { 3147 switch (product->type) { 3148 case MATPRODUCT_AB: 3149 case MATPRODUCT_AtB: 3150 case MATPRODUCT_ABt: 3151 case MATPRODUCT_PtAP: 3152 case MATPRODUCT_RARt: 3153 if (product->A->boundtocpu) { 3154 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3155 } else { 3156 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3157 } 3158 break; 3159 case MATPRODUCT_ABC: 3160 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3161 break; 3162 default: 3163 break; 3164 } 3165 } else if (Biscusp && Ciscusp) { 3166 switch (product->type) { 3167 case MATPRODUCT_AB: 3168 case MATPRODUCT_AtB: 3169 case MATPRODUCT_ABt: 3170 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3171 break; 3172 case MATPRODUCT_PtAP: 3173 case MATPRODUCT_RARt: 3174 case MATPRODUCT_ABC: 3175 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3176 break; 3177 default: 3178 break; 3179 } 3180 } else { /* fallback for AIJ */ 3181 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3182 } 3183 PetscFunctionReturn(PETSC_SUCCESS); 3184 } 3185 3186 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3187 { 3188 PetscFunctionBegin; 3189 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3190 PetscFunctionReturn(PETSC_SUCCESS); 3191 } 3192 3193 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3194 { 3195 
PetscFunctionBegin; 3196 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3197 PetscFunctionReturn(PETSC_SUCCESS); 3198 } 3199 3200 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3201 { 3202 PetscFunctionBegin; 3203 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3204 PetscFunctionReturn(PETSC_SUCCESS); 3205 } 3206 3207 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3208 { 3209 PetscFunctionBegin; 3210 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3211 PetscFunctionReturn(PETSC_SUCCESS); 3212 } 3213 3214 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3215 { 3216 PetscFunctionBegin; 3217 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3218 PetscFunctionReturn(PETSC_SUCCESS); 3219 } 3220 3221 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3222 { 3223 int i = blockIdx.x * blockDim.x + threadIdx.x; 3224 if (i < n) y[idx[i]] += x[i]; 3225 } 3226 3227 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3228 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3229 { 3230 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3231 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3232 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3233 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3234 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3235 PetscBool compressed; 3236 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3237 PetscInt nx, ny; 3238 #endif 3239 3240 PetscFunctionBegin; 3241 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3242 if (!a->nz) { 3243 if (yy) PetscCall(VecSeq_CUDA::copy(yy, zz)); 3244 else PetscCall(VecSeq_CUDA::set(zz, 0)); 3245 PetscFunctionReturn(PETSC_SUCCESS); 3246 } 3247 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3248 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3249 if (!trans) { 3250 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3251 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3252 } else { 3253 if (herm || !A->form_explicit_transpose) { 3254 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3255 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3256 } else { 3257 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3258 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3259 } 3260 } 3261 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3262 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3263 3264 try { 3265 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3266 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 3267 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3268 3269 PetscCall(PetscLogGpuTimeBegin()); 3270 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3271 /* z = A x + beta y. 3272 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3273 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3274 */ 3275 xptr = xarray; 3276 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3277 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3278 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3279 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3280 allocated to accommodate different uses. So we get the length info directly from mat. 3281 */ 3282 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3283 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3284 nx = mat->num_cols; 3285 ny = mat->num_rows; 3286 } 3287 #endif 3288 } else { 3289 /* z = A^T x + beta y 3290 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3291 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3292 */ 3293 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3294 dptr = zarray; 3295 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3296 if (compressed) { /* Scatter x to work vector */ 3297 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3298 3299 thrust::for_each( 3300 #if PetscDefined(HAVE_THRUST_ASYNC) 3301 thrust::cuda::par.on(PetscDefaultCudaStream), 3302 #endif 3303 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3304 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3305 } 3306 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3307 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3308 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3309 nx = mat->num_rows; 3310 ny = mat->num_cols; 3311 } 3312 #endif 3313 } 3314 3315 /* csr_spmv does y = alpha op(A) x + beta y */ 3316 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3317 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3318 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3319 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3320 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3321 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3322 PetscCallCUSPARSE( 3323 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3324 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3325 3326 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 
3327 } else { 3328 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3329 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3330 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3331 } 3332 3333 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3334 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3335 #else 3336 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3337 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3338 #endif 3339 } else { 3340 if (cusparsestruct->nrows) { 3341 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3342 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3343 #else 3344 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3345 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3346 #endif 3347 } 3348 } 3349 PetscCall(PetscLogGpuTimeEnd()); 3350 3351 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3352 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3353 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3354 PetscCall(VecSeq_CUDA::copy(yy, zz)); /* zz = yy */ 3355 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3356 PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */ 3357 } 3358 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3359 PetscCall(VecSeq_CUDA::set(zz, 0)); 3360 } 3361 3362 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3363 if (compressed) { 3364 PetscCall(PetscLogGpuTimeBegin()); 3365 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3366 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3367 prevent that. So I just add a ScatterAdd kernel. 3368 */ 3369 #if 0 3370 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3371 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3372 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3373 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3374 VecCUDAPlusEquals()); 3375 #else 3376 PetscInt n = matstruct->cprowIndices->size(); 3377 ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3378 #endif 3379 PetscCall(PetscLogGpuTimeEnd()); 3380 } 3381 } else { 3382 if (yy && yy != zz) PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */ 3383 } 3384 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3385 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3386 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3387 } catch (char *ex) { 3388 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, 
"CUSPARSE error: %s", ex); 3389 } 3390 if (yy) { 3391 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3392 } else { 3393 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3394 } 3395 PetscFunctionReturn(PETSC_SUCCESS); 3396 } 3397 3398 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3399 { 3400 PetscFunctionBegin; 3401 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3402 PetscFunctionReturn(PETSC_SUCCESS); 3403 } 3404 3405 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3406 { 3407 PetscObjectState onnz = A->nonzerostate; 3408 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 3409 3410 PetscFunctionBegin; 3411 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3412 if (onnz != A->nonzerostate && cusp->deviceMat) { 3413 PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 3414 PetscCallCUDA(cudaFree(cusp->deviceMat)); 3415 cusp->deviceMat = NULL; 3416 } 3417 PetscFunctionReturn(PETSC_SUCCESS); 3418 } 3419 3420 /* --------------------------------------------------------------------------------*/ 3421 /*@ 3422 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 3423 (the default parallel PETSc format). This matrix will ultimately pushed down 3424 to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix 3425 assembly performance the user should preallocate the matrix storage by setting 3426 the parameter nz (or the array nnz). By setting these parameters accurately, 3427 performance during matrix assembly can be increased by more than a factor of 50. 3428 3429 Collective 3430 3431 Input Parameters: 3432 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3433 . m - number of rows 3434 . n - number of columns 3435 . 
nz - number of nonzeros per row (same for all rows) 3436 - nnz - array containing the number of nonzeros in the various rows 3437 (possibly different for each row) or NULL 3438 3439 Output Parameter: 3440 . A - the matrix 3441 3442 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3443 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3444 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3445 3446 Notes: 3447 If nnz is given then nz is ignored 3448 3449 The AIJ format, also called 3450 compressed row storage, is fully compatible with standard Fortran 77 3451 storage. That is, the stored row and column indices can begin at 3452 either one (as in Fortran) or zero. See the users' manual for details. 3453 3454 Specify the preallocated storage with either nz or nnz (not both). 3455 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 3456 allocation. For large problems you MUST preallocate memory or you 3457 will get TERRIBLE performance, see the users' manual chapter on matrices. 3458 3459 By default, this format uses inodes (identical nodes) when possible, to 3460 improve numerical efficiency of matrix-vector products and solves. We 3461 search for consecutive rows with the same nonzero structure, thereby 3462 reusing matrix information to achieve increased efficiency. 
3463 3464 Level: intermediate 3465 3466 .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE` 3467 @*/ 3468 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3469 { 3470 PetscFunctionBegin; 3471 PetscCall(MatCreate(comm, A)); 3472 PetscCall(MatSetSizes(*A, m, n, m, n)); 3473 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 3474 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3475 PetscFunctionReturn(PETSC_SUCCESS); 3476 } 3477 3478 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3479 { 3480 PetscFunctionBegin; 3481 if (A->factortype == MAT_FACTOR_NONE) { 3482 PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr)); 3483 } else { 3484 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3485 } 3486 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3487 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 3488 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 3489 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3490 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3491 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3492 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3493 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3494 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3495 PetscCall(PetscObjectComposeFunction((PetscObject)A, 
"MatConvert_seqaijcusparse_hypre_C", NULL)); 3496 PetscCall(MatDestroy_SeqAIJ(A)); 3497 PetscFunctionReturn(PETSC_SUCCESS); 3498 } 3499 3500 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 3501 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3502 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3503 { 3504 PetscFunctionBegin; 3505 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3506 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 3507 PetscFunctionReturn(PETSC_SUCCESS); 3508 } 3509 3510 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3511 { 3512 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3513 Mat_SeqAIJCUSPARSE *cy; 3514 Mat_SeqAIJCUSPARSE *cx; 3515 PetscScalar *ay; 3516 const PetscScalar *ax; 3517 CsrMatrix *csry, *csrx; 3518 3519 PetscFunctionBegin; 3520 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3521 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3522 if (X->ops->axpy != Y->ops->axpy) { 3523 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3524 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3525 PetscFunctionReturn(PETSC_SUCCESS); 3526 } 3527 /* if we are here, it means both matrices are bound to GPU */ 3528 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3529 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3530 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3531 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3532 csry = (CsrMatrix *)cy->mat->mat; 3533 csrx = (CsrMatrix *)cx->mat->mat; 3534 /* see if we can turn this into a cublas axpy */ 3535 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3536 bool eq = thrust::equal(thrust::device, 
csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3537 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3538 if (eq) str = SAME_NONZERO_PATTERN; 3539 } 3540 /* spgeam is buggy with one column */ 3541 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3542 3543 if (str == SUBSET_NONZERO_PATTERN) { 3544 PetscScalar b = 1.0; 3545 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3546 size_t bufferSize; 3547 void *buffer; 3548 #endif 3549 3550 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3551 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3552 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3553 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3554 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3555 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3556 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3557 PetscCall(PetscLogGpuTimeBegin()); 3558 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3559 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3560 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3561 PetscCall(PetscLogGpuTimeEnd()); 3562 PetscCallCUDA(cudaFree(buffer)); 3563 #else 3564 PetscCall(PetscLogGpuTimeBegin()); 3565 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, 
csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3566 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3567 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3568 PetscCall(PetscLogGpuTimeEnd()); 3569 #endif 3570 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3571 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3572 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3573 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3574 } else if (str == SAME_NONZERO_PATTERN) { 3575 cublasHandle_t cublasv2handle; 3576 PetscBLASInt one = 1, bnz = 1; 3577 3578 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3579 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3580 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3581 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3582 PetscCall(PetscLogGpuTimeBegin()); 3583 PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 3584 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3585 PetscCall(PetscLogGpuTimeEnd()); 3586 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3587 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3588 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3589 } else { 3590 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3591 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3592 } 3593 PetscFunctionReturn(PETSC_SUCCESS); 3594 } 3595 3596 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3597 { 3598 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3599 PetscScalar *ay; 3600 cublasHandle_t cublasv2handle; 3601 PetscBLASInt one = 1, bnz = 1; 3602 3603 PetscFunctionBegin; 3604 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3605 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3606 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3607 PetscCall(PetscLogGpuTimeBegin()); 3608 PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, 
ay, one)); 3609 PetscCall(PetscLogGpuFlops(bnz)); 3610 PetscCall(PetscLogGpuTimeEnd()); 3611 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3612 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3613 PetscFunctionReturn(PETSC_SUCCESS); 3614 } 3615 3616 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3617 { 3618 PetscBool both = PETSC_FALSE; 3619 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3620 3621 PetscFunctionBegin; 3622 if (A->factortype == MAT_FACTOR_NONE) { 3623 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 3624 if (spptr->mat) { 3625 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3626 if (matrix->values) { 3627 both = PETSC_TRUE; 3628 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3629 } 3630 } 3631 if (spptr->matTranspose) { 3632 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3633 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3634 } 3635 } 3636 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3637 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 3638 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3639 else A->offloadmask = PETSC_OFFLOAD_CPU; 3640 PetscFunctionReturn(PETSC_SUCCESS); 3641 } 3642 3643 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3644 { 3645 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3646 3647 PetscFunctionBegin; 3648 if (A->factortype != MAT_FACTOR_NONE) { 3649 A->boundtocpu = flg; 3650 PetscFunctionReturn(PETSC_SUCCESS); 3651 } 3652 if (flg) { 3653 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3654 3655 A->ops->scale = MatScale_SeqAIJ; 3656 A->ops->axpy = MatAXPY_SeqAIJ; 3657 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3658 A->ops->mult = MatMult_SeqAIJ; 3659 A->ops->multadd = MatMultAdd_SeqAIJ; 3660 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3661 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3662 A->ops->multhermitiantranspose = NULL; 3663 A->ops->multhermitiantransposeadd = NULL; 3664 
A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3665 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 3666 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3667 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3668 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3669 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3670 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3671 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3672 } else { 3673 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3674 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3675 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3676 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3677 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3678 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3679 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3680 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3681 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3682 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3683 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3684 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3685 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3686 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3687 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3688 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3689 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 3690 3691 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", 
                                        MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are only usable by the CPU kernels, so enable them only when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE: allocate the GPU-side struct (cusparse
   handle, storage format, algorithm choices) and install the CUSPARSE method table.
   Also used in-place by MatCreate_SeqAIJCUSPARSE() with reuse == MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created to conform with this matrix will live on the GPU */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: attach the SpMV/SpMM support struct */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: attach the triangular-factor support struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU kernels (bindtocpu with PETSC_FALSE selects the CUSPARSE implementations) */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor: create a MATSEQAIJ then convert it in place to MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

/* Register the CUSPARSE-based direct solvers (LU/Cholesky/ILU/ICC and the banded LU) */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
3804 PetscFunctionReturn(PETSC_SUCCESS); 3805 } 3806 3807 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3808 { 3809 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr; 3810 3811 PetscFunctionBegin; 3812 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 3813 delete cusp->cooPerm; 3814 delete cusp->cooPerm_a; 3815 cusp->cooPerm = NULL; 3816 cusp->cooPerm_a = NULL; 3817 if (cusp->use_extended_coo) { 3818 PetscCallCUDA(cudaFree(cusp->jmap_d)); 3819 PetscCallCUDA(cudaFree(cusp->perm_d)); 3820 } 3821 cusp->use_extended_coo = PETSC_FALSE; 3822 PetscFunctionReturn(PETSC_SUCCESS); 3823 } 3824 3825 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3826 { 3827 PetscFunctionBegin; 3828 if (*cusparsestruct) { 3829 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 3830 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 3831 delete (*cusparsestruct)->workVector; 3832 delete (*cusparsestruct)->rowoffsets_gpu; 3833 delete (*cusparsestruct)->cooPerm; 3834 delete (*cusparsestruct)->cooPerm_a; 3835 delete (*cusparsestruct)->csr2csc_i; 3836 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3837 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3838 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3839 PetscCall(PetscFree(*cusparsestruct)); 3840 } 3841 PetscFunctionReturn(PETSC_SUCCESS); 3842 } 3843 3844 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3845 { 3846 PetscFunctionBegin; 3847 if (*mat) { 3848 delete (*mat)->values; 3849 delete (*mat)->column_indices; 3850 delete (*mat)->row_offsets; 3851 delete *mat; 3852 *mat = 0; 3853 } 3854 PetscFunctionReturn(PETSC_SUCCESS); 3855 } 3856 3857 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3858 
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* host (pinned) copy, hence cudaFreeHost */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy a matrix-multiply struct: the stored matrix (CSR data, or HYB handle on pre-11.0
   CUDA), descriptors, device-resident scalar constants, and any cached SpMV buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used as SpMV coefficients */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* generic-API descriptors and the per-operation cached SpMV state (3 slots) */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release all triangular-factor data (factors, permutations, work vectors, SpSV descriptors)
   but keep the containing struct and its cusparse handle alive for reuse */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Full teardown: reset the factors, then destroy the handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic (row, col) less-than for sorting COO index tuples */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (row, col) tuples, used to collapse duplicate COO entries */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when two adjacent indices are equal, 1 otherwise (marks where a new value starts) */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ?
0 : 1; }
};

/* Logical OR of two integers; merges the row-changed and col-changed flags */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   Scatter (and possibly reduce) the user-provided COO values v[] into the device CSR value
   array, using the permutation (cooPerm) and duplicate map (cooPerm_a) built at preallocation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                            *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging copy when v[] is on the host */
  thrust::device_ptr<const PetscScalar>  d_v;
  CsrMatrix                             *matrix;
  PetscInt                               n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; just flush assembly and return */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* NULL v with INSERT_VALUES means "zero the matrix" */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v[] lives on the host: stage it to the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them together first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mark the cached transpose as stale; when destroy is true also free it (and the csr2csc map) */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  /* discard stale permutation arrays when the number of COO entries changed */
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* mirror coo_i[] on the device if the caller passed host memory */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds; becomes the CSR row offsets (minus the leading 0) */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i      = [1,1,3,3,4,4]
      d_j      = [2,2,2,3,5,6]
      cooPerm  = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i      = [1,3,3,4,4,x]
                            ^ekey
      d_j      = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());                 /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>());      /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the Mat_SeqAIJ with ones matching the new pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* COO preallocation dispatcher: use the fast 'Basic' GPU path when indices contain no
   negative entries (negative indices mean "ignore this entry" and need the extended path) */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    /* only host indices can be scanned for negative entries; device indices are assumed basic */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* extended path: build the CPU-side jmap/perm then mirror them on the device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* CUDA kernel: accumulate COO values kv[] into the CSR value array a[].
   Grid-stride loop over the nnz of the matrix; for CSR slot i, jmap[i]..jmap[i+1] indexes
   the (possibly repeated) COO entries that sum into it, and perm maps those back into kv[]. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/* Set/add COO values: extended path launches MatAddCOOValues with the device jmap/perm,
   otherwise defers to the thrust-based 'Basic' implementation */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT overwrites every entry, so write access suffices; ADD needs the current values */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both pointers required; bail out otherwise */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the GPU CSR is compressed; lazily mirror the full host row offsets on the device */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* nothing to release; just invalidate the caller's pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* caller may write through the pointer: device copy becomes authoritative, cached transpose stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values may have changed; cached diagonal is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU(), the GPU struct must already exist */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic (row, col) less-than on 4-tuples (row, col, value, index); the value and
   index components are carried along but do not participate in the comparison */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Add a constant offset to an index (used to shift column indices when concatenating) */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */ 4599 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4600 { 4601 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4602 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4603 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4604 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4605 PetscInt Annz, Bnnz; 4606 cusparseStatus_t stat; 4607 PetscInt i, m, n, zero = 0; 4608 4609 PetscFunctionBegin; 4610 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4611 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4612 PetscValidPointer(C, 4); 4613 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4614 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4615 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4616 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4617 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4618 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4619 if (reuse == MAT_INITIAL_MATRIX) { 4620 m = A->rmap->n; 4621 n = A->cmap->n + B->cmap->n; 4622 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4623 PetscCall(MatSetSizes(*C, m, n, m, n)); 4624 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4625 c = (Mat_SeqAIJ *)(*C)->data; 4626 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4627 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4628 Ccsr = new CsrMatrix; 4629 Cmat->cprowIndices = NULL; 4630 c->compressedrow.use = PETSC_FALSE; 4631 c->compressedrow.nrows = 0; 4632 c->compressedrow.i = NULL; 4633 c->compressedrow.rindex = NULL; 4634 Ccusp->workVector = NULL; 4635 Ccusp->nrows = m; 4636 Ccusp->mat = Cmat; 4637 Ccusp->mat->mat = Ccsr; 4638 Ccsr->num_rows = 
m; 4639 Ccsr->num_cols = n; 4640 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4641 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4642 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4643 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 4644 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 4645 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4646 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4647 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4648 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4649 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4650 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4651 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4652 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4653 4654 Acsr = (CsrMatrix *)Acusp->mat->mat; 4655 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4656 Annz = (PetscInt)Acsr->column_indices->size(); 4657 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4658 c->nz = Annz + Bnnz; 4659 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4660 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4661 Ccsr->values = new THRUSTARRAY(c->nz); 4662 Ccsr->num_entries = c->nz; 4663 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4664 if (c->nz) { 4665 auto Acoo = new THRUSTINTARRAY32(Annz); 4666 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4667 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4668 THRUSTINTARRAY32 *Aroff, *Broff; 4669 4670 if (a->compressedrow.use) { /* need full row offset */ 4671 if (!Acusp->rowoffsets_gpu) { 4672 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4673 
Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4674 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4675 } 4676 Aroff = Acusp->rowoffsets_gpu; 4677 } else Aroff = Acsr->row_offsets; 4678 if (b->compressedrow.use) { /* need full row offset */ 4679 if (!Bcusp->rowoffsets_gpu) { 4680 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4681 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4682 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4683 } 4684 Broff = Bcusp->rowoffsets_gpu; 4685 } else Broff = Bcsr->row_offsets; 4686 PetscCall(PetscLogGpuTimeBegin()); 4687 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4688 PetscCallCUSPARSE(stat); 4689 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4690 PetscCallCUSPARSE(stat); 4691 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4692 auto Aperm = thrust::make_constant_iterator(1); 4693 auto Bperm = thrust::make_constant_iterator(0); 4694 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4695 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4696 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4697 #else 4698 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4699 auto Bcib = Bcsr->column_indices->begin(); 4700 auto Bcie = Bcsr->column_indices->end(); 4701 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 4702 #endif 4703 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4704 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4705 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4706 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4707 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4708 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4709 auto p1 = Ccusp->cooPerm->begin(); 4710 auto p2 = Ccusp->cooPerm->begin(); 4711 thrust::advance(p2, Annz); 4712 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4713 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 4714 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 4715 #endif 4716 auto cci = thrust::make_counting_iterator(zero); 4717 auto cce = thrust::make_counting_iterator(c->nz); 4718 #if 0 //Errors on SUMMIT cuda 11.1.0 4719 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4720 #else 4721 auto pred = thrust::identity<int>(); 4722 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4723 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 4724 #endif 4725 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4726 PetscCallCUSPARSE(stat); 4727 PetscCall(PetscLogGpuTimeEnd()); 4728 delete wPerm; 4729 delete Acoo; 4730 delete Bcoo; 4731 delete Ccoo; 4732 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4733 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4734 PetscCallCUSPARSE(stat); 4735 #endif 4736 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4737 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4738 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4739 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4740 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4741 CsrMatrix *CcsrT = new CsrMatrix; 4742 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4743 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4744 4745 (*C)->form_explicit_transpose = PETSC_TRUE; 4746 (*C)->transupdated = PETSC_TRUE; 4747 Ccusp->rowoffsets_gpu = NULL; 4748 CmatT->cprowIndices = NULL; 4749 CmatT->mat = CcsrT; 4750 CcsrT->num_rows = n; 4751 CcsrT->num_cols = m; 4752 CcsrT->num_entries = c->nz; 4753 4754 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4755 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4756 CcsrT->values = new THRUSTARRAY(c->nz); 4757 4758 PetscCall(PetscLogGpuTimeBegin()); 4759 auto rT = CcsrT->row_offsets->begin(); 4760 if (AT) { 4761 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4762 thrust::advance(rT, -1); 4763 } 4764 if (BT) { 4765 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4766 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4767 thrust::copy(titb, tite, rT); 4768 } 4769 auto cT = CcsrT->column_indices->begin(); 4770 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4771 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4772 auto vT = CcsrT->values->begin(); 4773 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4774 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4775 PetscCall(PetscLogGpuTimeEnd()); 4776 4777 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4778 
PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 4779 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4780 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 4781 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 4782 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4783 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4784 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4785 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4786 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4787 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4788 PetscCallCUSPARSE(stat); 4789 #endif 4790 Ccusp->matTranspose = CmatT; 4791 } 4792 } 4793 4794 c->singlemalloc = PETSC_FALSE; 4795 c->free_a = PETSC_TRUE; 4796 c->free_ij = PETSC_TRUE; 4797 PetscCall(PetscMalloc1(m + 1, &c->i)); 4798 PetscCall(PetscMalloc1(c->nz, &c->j)); 4799 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4800 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4801 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4802 ii = *Ccsr->row_offsets; 4803 jj = *Ccsr->column_indices; 4804 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4805 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4806 } else { 4807 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), 
Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4808 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4809 } 4810 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 4811 PetscCall(PetscMalloc1(m, &c->ilen)); 4812 PetscCall(PetscMalloc1(m, &c->imax)); 4813 c->maxnz = c->nz; 4814 c->nonzerorowcnt = 0; 4815 c->rmax = 0; 4816 for (i = 0; i < m; i++) { 4817 const PetscInt nn = c->i[i + 1] - c->i[i]; 4818 c->ilen[i] = c->imax[i] = nn; 4819 c->nonzerorowcnt += (PetscInt) !!nn; 4820 c->rmax = PetscMax(c->rmax, nn); 4821 } 4822 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4823 PetscCall(PetscMalloc1(c->nz, &c->a)); 4824 (*C)->nonzerostate++; 4825 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4826 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4827 Ccusp->nonzerostate = (*C)->nonzerostate; 4828 (*C)->preallocated = PETSC_TRUE; 4829 } else { 4830 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4831 c = (Mat_SeqAIJ *)(*C)->data; 4832 if (c->nz) { 4833 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4834 PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 4835 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4836 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 4837 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4838 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4839 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4840 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4841 Acsr = (CsrMatrix *)Acusp->mat->mat; 4842 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4843 Ccsr 
= (CsrMatrix *)Ccusp->mat->mat; 4844 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4845 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4846 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4847 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4848 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4849 auto pmid = Ccusp->cooPerm->begin(); 4850 thrust::advance(pmid, Acsr->num_entries); 4851 PetscCall(PetscLogGpuTimeBegin()); 4852 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 4853 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4854 thrust::for_each(zibait, zieait, VecCUDAEquals()); 4855 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4856 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4857 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 4858 
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4859 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4860 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4861 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4862 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4863 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4864 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4865 auto vT = CcsrT->values->begin(); 4866 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4867 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4868 (*C)->transupdated = PETSC_TRUE; 4869 } 4870 PetscCall(PetscLogGpuTimeEnd()); 4871 } 4872 } 4873 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4874 (*C)->assembled = PETSC_TRUE; 4875 (*C)->was_assembled = PETSC_FALSE; 4876 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4877 PetscFunctionReturn(PETSC_SUCCESS); 4878 } 4879 4880 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4881 { 4882 bool dmem; 4883 const PetscScalar *av; 4884 4885 PetscFunctionBegin; 4886 dmem = isCudaMem(v); 4887 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 4888 if (n && idx) { 4889 THRUSTINTARRAY widx(n); 4890 widx.assign(idx, idx + n); 4891 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 4892 4893 THRUSTARRAY *w = NULL; 4894 thrust::device_ptr<PetscScalar> dv; 4895 if (dmem) { 4896 dv = thrust::device_pointer_cast(v); 4897 } else { 4898 w = new THRUSTARRAY(n); 4899 dv = w->data(); 4900 } 4901 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4902 4903 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 4904 auto zieit 
= thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 4905 thrust::for_each(zibit, zieit, VecCUDAEquals()); 4906 if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 4907 delete w; 4908 } else { 4909 PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4910 } 4911 if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 4912 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 4913 PetscFunctionReturn(PETSC_SUCCESS); 4914 } 4915