/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Human-readable names used by PetscOptionsEnum(); the trailing entries are the enum type
   name and option prefix, per the PETSc enum-array convention, terminated by 0. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the type-specific implementations defined later in this file.
   NOTE: MatSeqAIJCUSPARSEMultStruct_Destroy is deliberately overloaded (this is a C++
   translation unit) for the tri-factor and mult structs. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Type-specific implementation behind MatCUSPARSESetFormat(). For SEQAIJCUSPARSE both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL currently store into the same single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod() is a no-op for matrix types that do not compose the method */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(); just records the flag,
   which MatLUFactorNumeric_SeqAIJCUSPARSE consults when selecting the solve routines. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).
160 161 Level: intermediate 162 163 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 164 @*/ 165 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 166 { 167 PetscFunctionBegin; 168 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169 PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 170 PetscFunctionReturn(0); 171 } 172 173 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 174 { 175 PetscFunctionBegin; 176 switch (op) { 177 case MAT_FORM_EXPLICIT_TRANSPOSE: 178 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 179 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 180 A->form_explicit_transpose = flg; 181 break; 182 default: 183 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 184 break; 185 } 186 PetscFunctionReturn(0); 187 } 188 189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 190 191 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 192 { 193 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 194 IS isrow = b->row,iscol = b->col; 195 PetscBool row_identity,col_identity; 196 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 197 198 PetscFunctionBegin; 199 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 200 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 201 B->offloadmask = PETSC_OFFLOAD_CPU; 202 /* determine which version of MatSolve needs to be used. 
*/ 203 PetscCall(ISIdentity(isrow,&row_identity)); 204 PetscCall(ISIdentity(iscol,&col_identity)); 205 206 if (!cusparsestruct->use_cpu_solve) { 207 if (row_identity && col_identity) { 208 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 209 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 210 } else { 211 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 212 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 213 } 214 } 215 B->ops->matsolve = NULL; 216 B->ops->matsolvetranspose = NULL; 217 218 /* get the triangular factors */ 219 if (!cusparsestruct->use_cpu_solve) { 220 PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 226 { 227 MatCUSPARSEStorageFormat format; 228 PetscBool flg; 229 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 230 231 PetscFunctionBegin; 232 PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 233 if (A->factortype == MAT_FACTOR_NONE) { 234 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 235 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 236 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 237 238 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 239 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 240 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 241 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 242 if (flg) 
PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 243 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 244 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 245 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 246 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 247 #if CUSPARSE_VERSION > 11301 248 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 249 #else 250 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 251 #endif 252 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 253 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 254 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 255 256 PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 257 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 258 PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 259 #endif 260 } 261 PetscOptionsHeadEnd(); 262 PetscFunctionReturn(0); 263 } 264 265 static PetscErrorCode 
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
/* Build (first call) or refresh (subsequent calls) the GPU copy of the unit-diagonal lower
   triangular ILU factor L, stored in CSR with explicit 1.0 diagonal entries, and run the
   cuSPARSE triangular-solve analysis on it. A holds the host-side factored SeqAIJ data. */
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (row 0 of the strict lower part is empty, hence ai[n]-ai[1] plus n diagonal ones) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assign()s below can copy to the device efficiently */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal; for each later
           row copy the strict-lower entries then append the 1.0 diagonal term */
        AiLo[0] = (PetscInt) 0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
        v       = aa;
        vi      = aj;
        offset  = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 (CUDA >= 9) requires a GENERAL matrix type; fill mode/diag type still describe the triangle */
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  loTriFactor->solveInfo,
                                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AA_h is kept alive for the update-values path below; row/col index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only: sparsity pattern is unchanged, so only refill AA_h and re-upload */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh) the GPU copy of the non-unit-diagonal upper triangular ILU factor U.
   The diagonal is stored inverted (1/d) as produced by the SeqAIJ factorization layout. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* (in the SeqAIJ factored layout adiag[] runs backwards, so adiag[0]-adiag[n] is the U count) */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers for the device uploads below */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first since the
           factored storage keeps U rows in reverse order */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          /* v[nz] holds 1/d(i) in the factored layout; store it as the diagonal value */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 (CUDA >= 9) requires GENERAL; the triangle is described by fill mode/diag type */
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  upTriFactor->solveInfo,
                                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep AA_h for cheap value-only updates; index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else { /* update values only: same sparsity pattern, so refill AA_h and re-upload */
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Entry point used after an (I)LU numeric factorization: builds both triangular factors on
   the GPU, sizes the work vector, and caches row/column permutations on the device when the
   orderings are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector shared by the two triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation on the device only when it is non-trivial */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices: same treatment for the column permutation */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh) the GPU triangular factors for an ICC (Cholesky) factorization.
   Only U is stored explicitly; the "lower" solve is expressed as a transpose solve on the
   upper-triangular structure (loTriFactor uses FILL_MODE_UPPER + OPERATION_TRANSPOSE). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as Mat_SeqSBAIJ here — the ICC factor appears to
     be stored in SBAIJ layout; confirm against the factorization routine that fills it */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const
PetscInt          *ai = b->i,*aj = b->j,*vj;
  const MatScalar   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned staging buffers; AAUp/AALo are refilled on every call, the index buffers only on first build */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix: each stored row starts with its diagonal,
           followed by the strictly-upper entries (negated; AALo additionally scaled by 1/d) */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] is the last stored entry of the row; used here as the diagonal pivot */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  upTriFactor->solveInfo,
                                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the "lower" factor reuses the upper-triangular pattern and solves its transpose */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  loTriFactor->solveInfo,
                                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix: values-only refresh, pattern unchanged */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Entry point used after an ICC numeric factorization: builds the GPU factors, sizes the
   work vector, and caches the permutation (and its inverse) on the device when non-trivial.
   (Function continues beyond this chunk of the file.) */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of L+U with the diagonal counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
PetscCall(ISRestoreIndices(iip,&irip)); 842 PetscCall(ISDestroy(&iip)); 843 PetscCall(ISRestoreIndices(ip,&rip)); 844 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 845 } 846 PetscFunctionReturn(0); 847 } 848 849 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 850 { 851 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 852 IS ip = b->row; 853 PetscBool perm_identity; 854 855 PetscFunctionBegin; 856 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 857 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 858 B->offloadmask = PETSC_OFFLOAD_CPU; 859 /* determine which version of MatSolve needs to be used. */ 860 PetscCall(ISIdentity(ip,&perm_identity)); 861 if (perm_identity) { 862 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 863 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 864 B->ops->matsolve = NULL; 865 B->ops->matsolvetranspose = NULL; 866 } else { 867 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 868 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 869 B->ops->matsolve = NULL; 870 B->ops->matsolvetranspose = NULL; 871 } 872 873 /* get the triangular factors */ 874 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 875 PetscFunctionReturn(0); 876 } 877 878 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 879 { 880 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 881 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 882 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 883 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 884 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 885 cusparseIndexBase_t indexBase; 886 cusparseMatrixType_t matrixType; 887 cusparseFillMode_t fillMode; 888 cusparseDiagType_t diagType; 889 890 PetscFunctionBegin; 891 /* allocate space for the 
transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; fill mode flips
     because the transpose of an upper-stored factor is lower triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transpose is stored explicitly, so solves use NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                                  loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
                                                  loTriFactorT->csrMat->values->data().get(),
                                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                     loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                     loTriFactor->csrMat->values->data().get(),
                                     loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(),
                                     loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: was PetscLogEventBegin, leaving the Begin above unmatched */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                            &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                            loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactorT->solveInfo,
                                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                                  upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
                                                  upTriFactorT->csrMat->values->data().get(),
                                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                     upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                     upTriFactor->csrMat->values->data().get(),
                                     upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(),
                                     upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: was PetscLogEventBegin, leaving the Begin above unmatched */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                            &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO: the lower- and upper-factor sequences above are near-identical and should be factored into a helper */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                            upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactorT->solveInfo,
                                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: truncate the real part of a PetscScalar to a PetscInt
   (used to turn csr2csc-permuted sequence values back into indices). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  /* nothing to do if the cached transpose is already up to date */
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident scalars used by SpMV-like calls) */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      /* transpose swaps row/column dimensions */
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps num_rows = A->rmap->n and row_offsets sized rmap->n+1,
         which for a transpose looks correct only when rmap->n == cmap->n — confirm this
         path is restricted to square matrices */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* build the CSR->CSC value permutation once: transpose a sequence 0..nnz-1
         stored as scalars, then read the permuted sequence back as integers */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* refresh the transposed values via the cached permutation (no csr2csc needed) */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */ 1335 if (!loTriFactorT && !upTriFactorT) { 1336 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1337 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1338 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1339 } 1340 1341 /* Get the GPU pointers */ 1342 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1343 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1344 xGPU = thrust::device_pointer_cast(xarray); 1345 bGPU = thrust::device_pointer_cast(barray); 1346 1347 PetscCall(PetscLogGpuTimeBegin()); 1348 /* First, reorder with the row permutation */ 1349 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1350 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1351 xGPU); 1352 1353 /* First, solve U */ 1354 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355 upTriFactorT->csrMat->num_rows, 1356 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1357 upTriFactorT->csrMat->num_entries, 1358 #endif 1359 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360 upTriFactorT->csrMat->values->data().get(), 1361 upTriFactorT->csrMat->row_offsets->data().get(), 1362 upTriFactorT->csrMat->column_indices->data().get(), 1363 upTriFactorT->solveInfo, 1364 xarray, 1365 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1366 tempGPU->data().get(), 1367 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1368 #else 1369 tempGPU->data().get());PetscCallCUSPARSE(stat); 1370 #endif 1371 1372 /* Then, solve L */ 1373 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1374 loTriFactorT->csrMat->num_rows, 1375 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1376 loTriFactorT->csrMat->num_entries, 1377 #endif 1378 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1379 loTriFactorT->csrMat->values->data().get(), 1380 
                              loTriFactorT->csrMat->row_offsets->data().get(),
                              loTriFactorT->csrMat->column_indices->data().get(),
                              loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b using the cached transposed triangular factors of a factorization done
   with natural ordering: no row/column permutation is applied, so bb feeds the U^T solve
   directly and the L^T solve writes straight into xx.
   The transposed factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;   /* device array of bb (read-only) */
  PetscScalar                       *xarray;   /* device array of xx (write-only) */
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* scratch for the intermediate triangular-solve result */

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; both pointers are set together,
     so a single missing one also triggers the (re)build */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U (transpose-solve order is U then L, the reverse of the forward solve) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                              upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                              upTriFactorT->csrMat->values->data().get(),
                              upTriFactorT->csrMat->row_offsets->data().get(),
                              upTriFactorT->csrMat->column_indices->data().get(),
                              upTriFactorT->solveInfo,
                              barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              tempGPU->data().get(),
                              upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                              loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                              loTriFactorT->csrMat->values->data().get(),
                              loTriFactorT->csrMat->row_offsets->data().get(),
                              loTriFactorT->csrMat->column_indices->data().get(),
                              loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached L/U triangular factors of a (possibly reordered)
   factorization: apply the row permutation to b, solve L then U, then apply the
   column permutation to produce x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;   /* device array of bb (read-only) */
  PetscScalar                           *xarray;   /* device array of xx (write-only) */
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation.
     NOTE(review): the end iterator reuses base bGPU while the transpose path uses bGPU+n;
     the iteration length is governed by the rpermIndices range, so both forms should agree — confirm intended. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                              loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                              loTriFactor->csrMat->values->data().get(),
                              loTriFactor->csrMat->row_offsets->data().get(),
                              loTriFactor->csrMat->column_indices->data().get(),
                              loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                              upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                              upTriFactor->csrMat->values->data().get(),
                              upTriFactor->csrMat->row_offsets->data().get(),
                              upTriFactor->csrMat->column_indices->data().get(),
                              upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              tempGPU->data().get(),
                              upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b with L/U factors from a natural-ordering factorization:
   no permutations, just L-solve (b -> temp) followed by U-solve (temp -> x). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;   /* device array of bb (read-only) */
  PetscScalar                       *xarray;   /* device array of xx (write-only) */
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* holds y of L y = b */

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                              loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                              loTriFactor->csrMat->values->data().get(),
                              loTriFactor->csrMat->row_offsets->data().get(),
                              loTriFactor->csrMat->column_indices->data().get(),
                              loTriFactor->solveInfo,
                              barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              tempGPU->data().get(),
                              loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                              upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                              upTriFactor->csrMat->values->data().get(),
                              upTriFactor->csrMat->row_offsets->data().get(),
                              upTriFactor->csrMat->column_indices->data().get(),
                              upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */

/* Forward solve with ILU(0) factors held in the cusparse generic (SpSV) API:
   L y = b (y kept in fs->Y), then U x = y. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barray;   /* device array of b (read-only) */
  PetscScalar                  *xarray;   /* device array of x (write-only) */

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b; rebind the dense-vector descriptors to the current device arrays */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X,
                                       fs->dnVecDescr_Y,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!

  /* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y,
                                       fs->dnVecDescr_X,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Transpose solve with ILU(0) factors via SpSV: U^T y = b then L^T x = y.
   The transpose SpSV descriptors/buffers are created on first call and their
   analysis is redone whenever the factor values change (updatedTransposeSpSVAnalysis). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barray;   /* device array of b (read-only) */
  PetscScalar                  *xarray;   /* device array of x (write-only) */

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                              CUSPARSE_OPERATION_TRANSPOSE,
                                              &PETSC_CUSPARSE_ONE,
                                              fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X,
                                              fs->dnVecDescr_Y,
                                              cusparse_scalartype,
                                              CUSPARSE_SPSV_ALG_DEFAULT,
                                              fs->spsvDescr_Lt,
                                              &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                              CUSPARSE_OPERATION_TRANSPOSE,
                                              &PETSC_CUSPARSE_ONE,
                                              fs->spMatDescr_U,
                                              fs->dnVecDescr_X,
                                              fs->dnVecDescr_Y,
                                              cusparse_scalartype,
                                              CUSPARSE_SPSV_ALG_DEFAULT,
                                              fs->spsvDescr_Ut,
                                              &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the numeric analysis for the transposed solves; the flag is cleared by the
     numeric factorization when L/U values change */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                            CUSPARSE_OPERATION_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Lt,
                                            fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                            CUSPARSE_OPERATION_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_U,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Ut,
                                            fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X,
                                       fs->dnVecDescr_Y,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,
                                       fs->dnVecDescr_X,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric ILU(0): copy A's values into fact's preallocated CSR arrays, factor in place
   with cusparseXcsrilu02, then redo the SpSV numeric analysis for L and U (required
   because SpSV analysis reads matrix values). */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                             fs->matDescr_M,
                                             fs->csrVal,
                                             fs->csrRowPtr,
                                             fs->csrColIdx,
                                             fs->ilu0Info_M,
                                             fs->policy_M,
                                             fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_L,
                                          fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_U,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_U,
                                          fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ILU(0): since ILU(0) introduces no fill, reuse A's sparsity pattern for fact.
   Allocates device CSR arrays, creates cusparse legacy (csrilu02) and generic (SpSV)
   descriptors for M, L, U, sizes and allocates all work buffers, runs the structural
   analysis, and estimates the FLOPs of the numeric phase.
   isrow/iscol are ignored (natural ordering is assumed for the ILU0 path). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff.
     NOTE(review): Reset is called on the local alias `fs` (== fact->spptr); presumably it frees
     the contents without destroying the container itself — confirm against its definition. */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are views of the same in-place-factored CSR storage (fs->csrVal), distinguished
     only by fill mode / diag type attributes */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                                      fs->csrRowPtr,
                                      fs->csrColIdx,
                                      fs->csrVal,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO,
                                      cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_FILL_MODE,
                                              &fillMode,
                                              sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_DIAG_TYPE,
                                              &diagType,
                                              sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
                                      fs->csrRowPtr,
                                      fs->csrColIdx,
                                      fs->csrVal,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO,
                                      cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                                              CUSPARSE_SPMAT_FILL_MODE,
                                              &fillMode,
                                              sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                                              CUSPARSE_SPMAT_DIAG_TYPE,
                                              &diagType,
                                              sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                        fs->matDescr_M,
                                                        fs->csrVal,
                                                        fs->csrRowPtr,
                                                        fs->csrColIdx,
                                                        fs->ilu0Info_M,
                                                        &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_L,
                                            &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_U,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_U,
                                            &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_U,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                      fs->matDescr_M,
                                                      fs->csrVal,
                                                      fs->csrRowPtr,
                                                      fs->csrColIdx,
                                                      fs->ilu0Info_M,
                                                      fs->policy_M,
                                                      fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,*Adiag,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i=0; i<m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i+1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i]; /* NOTE(review): dead store - immediately overwritten by the (nzRow-1)/2 estimate below; confirm which estimate is intended */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Forward/backward solve with IC(0) (Cholesky-like) factors via SpSV:
   L y = b (y in fs->Y), then L^T x = y. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barray;   /* device array of b (read-only) */
  PetscScalar                  *xarray;   /* device array of x (write-only) */

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y =
                                       X */
                                       fs->dnVecDescr_X,
                                       fs->dnVecDescr_Y,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,
                                       fs->dnVecDescr_X,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric IC(0): copy A's values into fact's CSR arrays, factor in place with
   cusparseXcsric02 (which only reads/writes the lower triangular part), then run
   the SpSV numeric analysis for both the L solve and the L^T solve. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
  */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz,
                                            fs->matDescr_M,
                                            fs->csrVal,
                                            fs->csrRowPtr,
                                            fs->csrColIdx,
                                            fs->ic0Info_M,
                                            fs->policy_M,
                                            fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_L,
                                          fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_Lt,
                                          fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  /* the factorization is symmetric (L L^T), so the same routine serves the transpose solve */
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.
  */
  /* We'll do in-place factorization on fact */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  /* Device copies of A's (uncompressed) row pointers and column indices; values are factored in place here */
  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* Legacy descriptor for csric02 (the factorization routine); must be GENERAL, zero-based */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* Generic SpMat descriptor for the lower-triangular factor L, sharing the same device arrays as M */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                                      fs->csrRowPtr,
                                      fs->csrColIdx,
                                      fs->csrVal,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO,
                                      cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_FILL_MODE,
                                              &fillMode,
                                              sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_DIAG_TYPE,
                                              &diagType,
                                              sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz,
                                                       fs->matDescr_M,
                                                       fs->csrVal,
                                                       fs->csrRowPtr,
                                                       fs->csrColIdx,
                                                       fs->ic0Info_M,
                                                       &fs->factBufferSize_M));

  /* Scratch dense vectors used by the SpSV solve phases */
  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_L,
                                            &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Lt,
                                            &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_Lt,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz,
                                                     fs->matDescr_M,
                                                     fs->csrVal,
                                                     fs->csrRowPtr,
                                                     fs->csrColIdx,
                                                     fs->ic0Info_M,
                                                     fs->policy_M,
                                                     fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
  */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i=0; i<m; i++) {
      nzRow = Ai[i+1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow-1)/2; /* rough model: half the off-diagonal entries of the row lie left of the diagonal */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
#endif

/* Symbolic ILU: use the cusparse-native ILU(0) fast path when no fill levels are requested and both
   orderings are the identity; otherwise fall back to the CPU symbolic factorization and attach the
   cusparse numeric-factorization routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity,col_identity;
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* Symbolic LU: symbolic phase always runs on the CPU; the numeric phase is dispatched to cusparse */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: use the cusparse-native IC(0) fast path when no fill levels are requested and the
   permutation is the identity; otherwise fall back to the CPU symbolic factorization. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity;
  PetscCall(ISIdentity(perm,&perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: symbolic phase on the CPU; numeric phase dispatched to cusparse */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports the solver package name (MATSOLVERCUSPARSE) */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type,
 seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factory for cusparse-backed factor matrices (LU/ILU/ILUDT and Cholesky/ICC): creates the square
   factor matrix B, installs the symbolic-factorization function pointers (GPU versions unless A is
   bound to the CPU), and records the preferred orderings for each factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* Propagate CPU binding from A so the factor stays on the host when requested */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Copy the matrix values (not the pattern) from the GPU back to the host CSR arrays when the GPU
   copy is the freshest one; afterwards both copies are valid (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* NOTE(review): for a factored matrix spptr holds the TriFactors struct instead -- confirm against header */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Read/write host access to the CSR values: sync from the GPU first if needed */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* After read/write access the host copy is authoritative, so mark the GPU copy stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only host access: sync from the GPU but leave the offload mask untouched */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only host access: no device-to-host copy since the values will be overwritten anyway */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Expose the device CSR arrays (32-bit row offsets / column indices) and report CUDA memtype.
   Only valid for non-factored matrices; copies the matrix to the GPU first if necessary. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE,
PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Copy the host CSR matrix to the GPU. When the nonzero pattern is unchanged (same nonzerostate) and
   the format is CSR, only the values are uploaded; otherwise all device structures (descriptors,
   scalar constants, index arrays, optional HYB/ELL conversion, compressed-row index list) are rebuilt. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when host values are absent, so only the GPU copy is valid */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* Values changed: the cached transpose values are stale (structure is still valid) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Use the compressed-row representation (only rows with nonzeros) when available */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (1, 0, 1), matching CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows       = m;
          mat->num_cols       = A->cmap->n;
          mat->num_entries    = nnz;
          mat->row_offsets    = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Build a temporary CSR copy, convert it to HYB/ELL, then free the CSR scratch */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows       = m;
          mat->num_cols       = A->cmap->n;
          mat->num_entries    = nnz;
          mat->row_offsets    = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor applied to a zipped (x,y) tuple: y += x elementwise */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor applied to a zipped (x,y) tuple: y = x elementwise */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor applied to a zipped (x,y) tuple: x = y elementwise (reverse assignment) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* State attached to C->product->data for cusparse mat-mat products; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense;   /* when set, C is converted back to MATSEQDENSE after the numeric phase -- TODO confirm where it is set (symbolic phase not fully visible here) */
  PetscScalar    *Bt;        /* device buffer for an explicit transpose of B (pre-CUDA-11 path uses cublasXgeam) */
  Mat            X;          /* intermediate dense product used for PtAP/RARt */
  PetscBool      reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda;    /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                 *dBuffer4;
  void                 *dBuffer5;
#endif
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse product data: releases device buffers, cusparse descriptors,
   the intermediate matrix X, and finally the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense (SEQDENSECUDA).
   For PtAP/RARt the sparse-dense product is computed into the intermediate mmdata->X and the final
   dense-dense product is done by MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Select the operand matrix, the operation on A, and the result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
2855 if (!A->form_explicit_transpose) { 2856 mat = cusp->mat; 2857 opA = CUSPARSE_OPERATION_TRANSPOSE; 2858 } else { 2859 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2860 mat = cusp->matTranspose; 2861 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2862 } 2863 m = A->cmap->n; 2864 n = B->cmap->n; 2865 break; 2866 case MATPRODUCT_ABt: 2867 case MATPRODUCT_RARt: 2868 mat = cusp->mat; 2869 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2870 m = A->rmap->n; 2871 n = B->rmap->n; 2872 break; 2873 default: 2874 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2875 } 2876 PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2877 csrmat = (CsrMatrix*)mat->mat; 2878 /* if the user passed a CPU matrix, copy the data to the GPU */ 2879 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 2880 if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 2881 PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2882 2883 PetscCall(MatDenseGetLDA(B,&blda)); 2884 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2885 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 2886 PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2887 } else { 2888 PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 2889 PetscCall(MatDenseGetLDA(C,&clda)); 2890 } 2891 2892 PetscCall(PetscLogGpuTimeBegin()); 2893 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2894 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2895 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2896 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2897 size_t mmBufferSize; 2898 if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2899 if (!mmdata->matBDescr) { 2900 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2901 mmdata->Blda = blda; 2902 } 2903 2904 if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2905 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2906 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2907 mmdata->Clda = clda; 2908 } 2909 2910 if (!mat->matDescr) { 2911 stat = cusparseCreateCsr(&mat->matDescr, 2912 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2913 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2914 csrmat->values->data().get(), 2915 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2916 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2917 } 2918 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2919 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2920 mmdata->matCDescr,cusparse_scalartype, 2921 cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2922 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2923 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2924 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2925 mmdata->mmBufferSize = mmBufferSize; 2926 } 2927 mmdata->initialized = PETSC_TRUE; 2928 } else { 2929 /* to be safe, always update pointers of the mats 
*/ 2930 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2931 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2932 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2933 } 2934 2935 /* do cusparseSpMM, which supports transpose on B */ 2936 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2937 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2938 mmdata->matCDescr,cusparse_scalartype, 2939 cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2940 #else 2941 PetscInt k; 2942 /* cusparseXcsrmm does not support transpose on B */ 2943 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2944 cublasHandle_t cublasv2handle; 2945 cublasStatus_t cerr; 2946 2947 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2948 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2949 B->cmap->n,B->rmap->n, 2950 &PETSC_CUSPARSE_ONE ,barray,blda, 2951 &PETSC_CUSPARSE_ZERO,barray,blda, 2952 mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2953 blda = B->cmap->n; 2954 k = B->cmap->n; 2955 } else { 2956 k = B->rmap->n; 2957 } 2958 2959 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2960 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2961 csrmat->num_entries,mat->alpha_one,mat->descr, 2962 csrmat->values->data().get(), 2963 csrmat->row_offsets->data().get(), 2964 csrmat->column_indices->data().get(), 2965 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* for RARt/PtAP the sparse product landed in the intermediate mmdata->X; finish with a dense-dense product */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* convert C (and B) back to host SEQDENSE when the caller handed us CPU matrices */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for C = A*B, A^t*B, A*B^t, P^t*A*P, R*A*R^t with A of type
   SEQAIJCUSPARSE (CSR storage only) and B dense: sets the size and type of C
   and allocates the MatMatCusparse product data used by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes m x n depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase for sparse-sparse products (AB, AtB, ABt) with all matrices
   SEQAIJCUSPARSE: runs the cuSPARSE SpGEMM (or csrgemm pre CUDA 11) using the
   descriptors and buffers prepared by the symbolic phase. */
static PetscErrorCode
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just assemble */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* if symbolic exploited symmetry of A (resp. B), map AtB (resp. ABt) back to AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored matTranspose structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* SpGEMMreuse path: the pattern was computed in symbolic; only values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse-sparse products: computes the sparsity pattern of C
   on the GPU via cuSPARSE SpGEMM/SpGEMMreuse (or csrgemmNnz pre CUDA 11) and
   mirrors the resulting CSR pattern into the host SEQAIJ arrays of C. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: AtB with symmetric A (or ABt with symmetric B) reduces to AB;
     record that fact so the numeric phase can verify it */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operands (building explicit transposes where needed) and result sizes m x n, inner dim k */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows = Ccusp->nrows;
  Ccsr->num_cols = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants, required because the handle uses CUSPARSE_POINTER_MODE_DEVICE below */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C starts with an empty pattern; cuSPARSE fills in nnz and we attach the arrays below */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the GPU CSR pattern into the host SEQAIJ arrays of C */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* derive per-row lengths and row statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated = PETSC_TRUE;
  C->assembled = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask
== PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* Select the symbolic-product implementation for a MATSEQAIJCUSPARSE matrix product; handles sparse or dense B.
   When both operands live on the GPU the user may still force the CPU backend via the
   -mat*_backend_cpu (api_user) or -mat_product_algorithm_backend_cpu options. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* only consider the GPU path when neither operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  /* yy = A^H xx */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for 0 <= i < n. 1D launch; callers size the grid as (n+255)/256 blocks of 256
   threads, and the i < n guard handles the ragged tail. idx entries are assumed distinct (they are
   the compressed-row indices), so no atomics are needed. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (no add) and may alias zz. When A stores compressed rows (zero rows dropped),
   a work vector holds the short product/operand and the result is scattered into the full-length z. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: z = y (or 0) without touching cuSPARSE */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* apply op to the untransposed matrix inside cuSPARSE */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and build on demand) the explicitly stored transpose */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (onnz != A->nonzerostate &&
cusp->deviceMat) { /* the raw device copy is stale once the nonzero structure changes */
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

/* Free the GPU-side data (plain or factored) and the composed query functions, then
   fall through to the host AIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate on the host as SeqAIJ, then convert the copy in place back to AIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Uses a flat cuBLAS axpy when the nonzero patterns match,
   cusparse csrgeam when X's pattern is a subset of Y's, and falls back to the host
   implementation otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed bindings: do the work on the host and drop Y's cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* a and b live on the host, so switch cuSPARSE to host pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, done as a flat cuBLAS scal over the nonzero array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ  *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar *ay;
  cublasHandle_t
cublasv2handle;
  PetscBLASInt one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero the matrix values on the device (plain and cached transpose, when present)
   and on the host, then set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values zeroed too, so both copies stay valid */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch the operation table (and composed functions) between the host SeqAIJ
   implementations (flg = PETSC_TRUE) and the CUSPARSE implementations (flg = PETSC_FALSE). */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4282 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4283 spptr->format = MAT_CUSPARSE_CSR; 4284 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4285 #if CUSPARSE_VERSION > 11301 4286 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4287 #else 4288 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4289 #endif 4290 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4291 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4292 #endif 4293 B->spptr = spptr; 4294 } else { 4295 Mat_SeqAIJCUSPARSETriFactors *spptr; 4296 4297 PetscCall(PetscNew(&spptr)); 4298 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4299 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4300 B->spptr = spptr; 4301 } 4302 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4303 } 4304 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4305 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4306 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4307 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4308 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4309 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4310 4311 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 4312 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 4313 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4314 #if defined(PETSC_HAVE_HYPRE) 4315 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 4316 #endif 4317 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4318 PetscFunctionReturn(0); 4319 } 4320 4321 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4322 { 4323 PetscFunctionBegin; 4324 
PetscCall(MatCreate_SeqAIJ(B)); 4325 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 4326 PetscFunctionReturn(0); 4327 } 4328 4329 /*MC 4330 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4331 4332 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 4333 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 4334 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 4335 4336 Options Database Keys: 4337 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 4338 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4339 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
4340 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 4341 4342 Level: beginner 4343 4344 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4345 M*/ 4346 4347 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 4348 4349 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4350 { 4351 PetscFunctionBegin; 4352 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 4353 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 4354 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 4355 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 4356 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 4357 4358 PetscFunctionReturn(0); 4359 } 4360 4361 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 4362 { 4363 Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 4364 4365 PetscFunctionBegin; 4366 if (!cusp) PetscFunctionReturn(0); 4367 delete cusp->cooPerm; 4368 delete cusp->cooPerm_a; 4369 cusp->cooPerm = NULL; 4370 cusp->cooPerm_a = NULL; 4371 if (cusp->use_extended_coo) { 4372 PetscCallCUDA(cudaFree(cusp->jmap_d)); 4373 PetscCallCUDA(cudaFree(cusp->perm_d)); 4374 } 4375 cusp->use_extended_coo = PETSC_FALSE; 4376 PetscFunctionReturn(0); 4377 } 4378 4379 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 4380 { 4381 PetscFunctionBegin; 4382 if (*cusparsestruct) { 4383 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 4384 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 4385 delete (*cusparsestruct)->workVector; 4386 delete (*cusparsestruct)->rowoffsets_gpu; 4387 delete (*cusparsestruct)->cooPerm; 4388 delete (*cusparsestruct)->cooPerm_a; 4389 delete (*cusparsestruct)->csr2csc_i; 4390 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 4391 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 4392 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 4393 PetscCall(PetscFree(*cusparsestruct)); 4394 } 4395 PetscFunctionReturn(0); 4396 } 4397 4398 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4399 { 4400 PetscFunctionBegin; 4401 if (*mat) { 4402 delete (*mat)->values; 4403 delete (*mat)->column_indices; 4404 delete (*mat)->row_offsets; 4405 delete *mat; 4406 *mat = 0; 4407 } 4408 PetscFunctionReturn(0); 4409 } 4410 4411 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4412 { 4413 PetscFunctionBegin; 4414 if (*trifactor) { 4415 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4416 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 4417 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 4418 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 4419 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4420 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4421 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4422 #endif 4423 PetscCall(PetscFree(*trifactor)); 4424 } 4425 PetscFunctionReturn(0); 4426 } 4427 4428 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 4429 { 4430 CsrMatrix *mat; 4431 4432 PetscFunctionBegin; 4433 
  /* NOTE(review): continuation of MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,...)
     whose head is in the previous chunk */
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        /* ELL/HYB storage was removed from cuSPARSE in CUDA 11 */
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used by SpMV calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptors and the per-operation SpMV buffers/vector descriptors */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Reset (but do not free) a triangular-factors container: destroy the four factor
   structs, permutations, work vector, banded-solver storage, and the CUDA-11.5+
   SpSV/ILU0/IC0 resources, leaving the struct reusable for a new factorization. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* NOTE(review): these pointers are freed but not reset to NULL; presumably safe only
       because the caller re-fills them before the next use -- verify no double Reset occurs */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}

/* Fully destroy a triangular-factors container: Reset, then release the cuSPARSE
   handle and the struct itself. Body continues in the next chunk. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
cusparseHandle_t handle; 4522 4523 PetscFunctionBegin; 4524 if (*trifactors) { 4525 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4526 if (handle = (*trifactors)->handle) { 4527 PetscCallCUSPARSE(cusparseDestroy(handle)); 4528 } 4529 PetscCall(PetscFree(*trifactors)); 4530 } 4531 PetscFunctionReturn(0); 4532 } 4533 4534 struct IJCompare 4535 { 4536 __host__ __device__ 4537 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4538 { 4539 if (t1.get<0>() < t2.get<0>()) return true; 4540 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4541 return false; 4542 } 4543 }; 4544 4545 struct IJEqual 4546 { 4547 __host__ __device__ 4548 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4549 { 4550 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 4551 return true; 4552 } 4553 }; 4554 4555 struct IJDiff 4556 { 4557 __host__ __device__ 4558 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 4559 { 4560 return t1 == t2 ? 
0 : 1; 4561 } 4562 }; 4563 4564 struct IJSum 4565 { 4566 __host__ __device__ 4567 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 4568 { 4569 return t1||t2; 4570 } 4571 }; 4572 4573 #include <thrust/iterator/discard_iterator.h> 4574 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 4575 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 4576 { 4577 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4578 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4579 THRUSTARRAY *cooPerm_v = NULL; 4580 thrust::device_ptr<const PetscScalar> d_v; 4581 CsrMatrix *matrix; 4582 PetscInt n; 4583 4584 PetscFunctionBegin; 4585 PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 4586 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 4587 if (!cusp->cooPerm) { 4588 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 4589 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 4590 PetscFunctionReturn(0); 4591 } 4592 matrix = (CsrMatrix*)cusp->mat->mat; 4593 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4594 if (!v) { 4595 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 4596 goto finalize; 4597 } 4598 n = cusp->cooPerm->size(); 4599 if (isCudaMem(v)) { 4600 d_v = thrust::device_pointer_cast(v); 4601 } else { 4602 cooPerm_v = new THRUSTARRAY(n); 4603 cooPerm_v->assign(v,v+n); 4604 d_v = cooPerm_v->data(); 4605 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4606 } 4607 PetscCall(PetscLogGpuTimeBegin()); 4608 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4609 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4610 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 4611 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4612 /* 
thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4613 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4614 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 4615 */ 4616 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4617 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 4618 delete cooPerm_w; 4619 } else { 4620 /* all nonzeros in d_v[] are unique entries */ 4621 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4622 matrix->values->begin())); 4623 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4624 matrix->values->end())); 4625 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 4626 } 4627 } else { 4628 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 4629 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4630 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4631 } else { 4632 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4633 matrix->values->begin())); 4634 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4635 matrix->values->end())); 4636 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4637 } 4638 } 4639 PetscCall(PetscLogGpuTimeEnd()); 4640 finalize: 4641 delete cooPerm_v; 4642 A->offloadmask = PETSC_OFFLOAD_GPU; 4643 
PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4644 /* shorter version of MatAssemblyEnd_SeqAIJ */ 4645 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 4646 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 4647 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 4648 a->reallocs = 0; 4649 A->info.mallocs += 0; 4650 A->info.nz_unneeded = 0; 4651 A->assembled = A->was_assembled = PETSC_TRUE; 4652 A->num_ass++; 4653 PetscFunctionReturn(0); 4654 } 4655 4656 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4657 { 4658 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4659 4660 PetscFunctionBegin; 4661 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4662 if (!cusp) PetscFunctionReturn(0); 4663 if (destroy) { 4664 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 4665 delete cusp->csr2csc_i; 4666 cusp->csr2csc_i = NULL; 4667 } 4668 A->transupdated = PETSC_FALSE; 4669 PetscFunctionReturn(0); 4670 } 4671 4672 #include <thrust/binary_search.h> 4673 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 4674 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 4675 { 4676 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4677 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4678 PetscInt cooPerm_n, nzr = 0; 4679 4680 PetscFunctionBegin; 4681 PetscCall(PetscLayoutSetUp(A->rmap)); 4682 PetscCall(PetscLayoutSetUp(A->cmap)); 4683 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 4684 if (n != cooPerm_n) { 4685 delete cusp->cooPerm; 4686 delete cusp->cooPerm_a; 4687 cusp->cooPerm = NULL; 4688 cusp->cooPerm_a = NULL; 4689 } 4690 if (n) { 4691 THRUSTINTARRAY d_i(n); 4692 THRUSTINTARRAY d_j(n); 4693 THRUSTINTARRAY ii(A->rmap->n); 4694 4695 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 4696 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 4697 4698 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 4699 d_i.assign(coo_i,coo_i+n); 4700 d_j.assign(coo_j,coo_j+n); 4701 4702 /* Ex. 4703 n = 6 4704 coo_i = [3,3,1,4,1,4] 4705 coo_j = [3,2,2,5,2,6] 4706 */ 4707 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4708 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4709 4710 PetscCall(PetscLogGpuTimeBegin()); 4711 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4712 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4713 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4714 THRUSTINTARRAY w = d_j; 4715 4716 /* 4717 d_i = [1,1,3,3,4,4] 4718 d_j = [2,2,2,3,5,6] 4719 cooPerm = [2,4,1,0,3,5] 4720 */ 4721 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4722 4723 /* 4724 d_i = [1,3,3,4,4,x] 4725 ^ekey 4726 d_j = [2,2,3,5,6,x] 4727 ^nekye 4728 */ 4729 if (nekey == ekey) { /* all entries are unique */ 4730 delete cusp->cooPerm_a; 4731 cusp->cooPerm_a = NULL; 4732 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4733 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4734 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4735 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4736 (*cusp->cooPerm_a)[0] = 0; /* clear the first 
entry, though accessing an entry on device implies a cudaMemcpy */ 4737 w[0] = 0; 4738 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4739 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4740 } 4741 thrust::counting_iterator<PetscInt> search_begin(0); 4742 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4743 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4744 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4745 PetscCall(PetscLogGpuTimeEnd()); 4746 4747 PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 4748 a->singlemalloc = PETSC_FALSE; 4749 a->free_a = PETSC_TRUE; 4750 a->free_ij = PETSC_TRUE; 4751 PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 4752 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4753 PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4754 a->nz = a->maxnz = a->i[A->rmap->n]; 4755 a->rmax = 0; 4756 PetscCall(PetscMalloc1(a->nz,&a->a)); 4757 PetscCall(PetscMalloc1(a->nz,&a->j)); 4758 PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4759 if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 4760 if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 4761 for (PetscInt i = 0; i < A->rmap->n; i++) { 4762 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4763 nzr += (PetscInt)!!(nnzr); 4764 a->ilen[i] = a->imax[i] = nnzr; 4765 a->rmax = PetscMax(a->rmax,nnzr); 4766 } 4767 a->nonzerorowcnt = nzr; 4768 A->preallocated = PETSC_TRUE; 4769 PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 
4770 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4771 } else { 4772 PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 4773 } 4774 PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 4775 4776 /* We want to allocate the CUSPARSE struct for matvec now. 4777 The code is so convoluted now that I prefer to copy zeros */ 4778 PetscCall(PetscArrayzero(a->a,a->nz)); 4779 PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 4780 A->offloadmask = PETSC_OFFLOAD_CPU; 4781 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4782 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 4783 PetscFunctionReturn(0); 4784 } 4785 4786 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 4787 { 4788 Mat_SeqAIJ *seq; 4789 Mat_SeqAIJCUSPARSE *dev; 4790 PetscBool coo_basic = PETSC_TRUE; 4791 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4792 4793 PetscFunctionBegin; 4794 PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 4795 PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4796 if (coo_i) { 4797 PetscCall(PetscGetMemType(coo_i,&mtype)); 4798 if (PetscMemTypeHost(mtype)) { 4799 for (PetscCount k=0; k<coo_n; k++) { 4800 if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4801 } 4802 } 4803 } 4804 4805 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4806 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4807 } else { 4808 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4809 mat->offloadmask = PETSC_OFFLOAD_CPU; 4810 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4811 seq = static_cast<Mat_SeqAIJ*>(mat->data); 4812 dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 4813 PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 4814 PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4815 
PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 4816 PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4817 dev->use_extended_coo = PETSC_TRUE; 4818 } 4819 PetscFunctionReturn(0); 4820 } 4821 4822 __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4823 { 4824 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4825 const PetscCount grid_size = gridDim.x * blockDim.x; 4826 for (; i<nnz; i+= grid_size) { 4827 PetscScalar sum = 0.0; 4828 for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4829 a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum; 4830 } 4831 } 4832 4833 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4834 { 4835 Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4836 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4837 PetscCount Annz = seq->nz; 4838 PetscMemType memtype; 4839 const PetscScalar *v1 = v; 4840 PetscScalar *Aa; 4841 4842 PetscFunctionBegin; 4843 if (dev->use_extended_coo) { 4844 PetscCall(PetscGetMemType(v,&memtype)); 4845 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4846 PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 4847 PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4848 } 4849 4850 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 4851 else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4852 4853 if (Annz) { 4854 MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 4855 PetscCallCUDA(cudaPeekAtLastError()); 4856 } 4857 4858 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 4859 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4860 4861 if (PetscMemTypeHost(memtype)) 
PetscCallCUDA(cudaFree((void*)v1)); 4862 } else { 4863 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4864 } 4865 PetscFunctionReturn(0); 4866 } 4867 4868 /*@C 4869 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 4870 4871 Not collective 4872 4873 Input Parameters: 4874 + A - the matrix 4875 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4876 4877 Output Parameters: 4878 + ia - the CSR row pointers 4879 - ja - the CSR column indices 4880 4881 Level: developer 4882 4883 Notes: 4884 When compressed is true, the CSR structure does not contain empty rows 4885 4886 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4887 @*/ 4888 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4889 { 4890 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4891 CsrMatrix *csr; 4892 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4893 4894 PetscFunctionBegin; 4895 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4896 if (!i || !j) PetscFunctionReturn(0); 4897 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4898 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4899 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4900 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4901 csr = (CsrMatrix*)cusp->mat->mat; 4902 if (i) { 4903 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4904 if (!cusp->rowoffsets_gpu) { 4905 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4906 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4907 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4908 } 4909 *i = cusp->rowoffsets_gpu->data().get(); 4910 } else *i = csr->row_offsets->data().get(); 4911 } 4912 if (j) *j = csr->column_indices->data().get(); 
4913 PetscFunctionReturn(0); 4914 } 4915 4916 /*@C 4917 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4918 4919 Not collective 4920 4921 Input Parameters: 4922 + A - the matrix 4923 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4924 4925 Output Parameters: 4926 + ia - the CSR row pointers 4927 - ja - the CSR column indices 4928 4929 Level: developer 4930 4931 .seealso: `MatSeqAIJCUSPARSEGetIJ()` 4932 @*/ 4933 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4934 { 4935 PetscFunctionBegin; 4936 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4937 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4938 if (i) *i = NULL; 4939 if (j) *j = NULL; 4940 PetscFunctionReturn(0); 4941 } 4942 4943 /*@C 4944 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4945 4946 Not Collective 4947 4948 Input Parameter: 4949 . A - a MATSEQAIJCUSPARSE matrix 4950 4951 Output Parameter: 4952 . 
a - pointer to the device data 4953 4954 Level: developer 4955 4956 Notes: may trigger host-device copies if up-to-date matrix data is on host 4957 4958 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 4959 @*/ 4960 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4961 { 4962 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4963 CsrMatrix *csr; 4964 4965 PetscFunctionBegin; 4966 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4967 PetscValidPointer(a,2); 4968 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4969 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4970 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4971 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4972 csr = (CsrMatrix*)cusp->mat->mat; 4973 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4974 *a = csr->values->data().get(); 4975 PetscFunctionReturn(0); 4976 } 4977 4978 /*@C 4979 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4980 4981 Not Collective 4982 4983 Input Parameter: 4984 . A - a MATSEQAIJCUSPARSE matrix 4985 4986 Output Parameter: 4987 . a - pointer to the device data 4988 4989 Level: developer 4990 4991 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 4992 @*/ 4993 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4994 { 4995 PetscFunctionBegin; 4996 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4997 PetscValidPointer(a,2); 4998 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4999 *a = NULL; 5000 PetscFunctionReturn(0); 5001 } 5002 5003 /*@C 5004 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 5005 5006 Not Collective 5007 5008 Input Parameter: 5009 . 
A - a MATSEQAIJCUSPARSE matrix 5010 5011 Output Parameter: 5012 . a - pointer to the device data 5013 5014 Level: developer 5015 5016 Notes: may trigger host-device copies if up-to-date matrix data is on host 5017 5018 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 5019 @*/ 5020 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 5021 { 5022 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5023 CsrMatrix *csr; 5024 5025 PetscFunctionBegin; 5026 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5027 PetscValidPointer(a,2); 5028 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5029 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5030 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5031 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5032 csr = (CsrMatrix*)cusp->mat->mat; 5033 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5034 *a = csr->values->data().get(); 5035 A->offloadmask = PETSC_OFFLOAD_GPU; 5036 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5037 PetscFunctionReturn(0); 5038 } 5039 /*@C 5040 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 5041 5042 Not Collective 5043 5044 Input Parameter: 5045 . A - a MATSEQAIJCUSPARSE matrix 5046 5047 Output Parameter: 5048 . 
 a - pointer to the device data (set to NULL on return)

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified: drop cached diagonal and bump object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous device value array */
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() call since existing values
     will be overwritten by the caller */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* GPU copy becomes the valid one; cached explicit transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access
array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data (set to NULL on return)

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written: drop cached diagonal and bump object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row,col) comparison of (row, col, value, perm-tag) tuples;
   used as the ordering predicate for thrust::merge of two COO streams */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed offset to an int; used to shift the column
   (or row offset) indices of B when concatenating with A */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in matlab notation */
/* C = [A B] (horizontal concatenation): A and B must have the same number of rows.
   With MAT_INITIAL_MATRIX the sparsity of C is built on the GPU by converting both
   operands to COO, shifting B's columns by A->cmap->n, merging the two streams in
   (row,col) order, and converting back to CSR; the merge permutation is kept in
   Ccusp->cooPerm so that MAT_REUSE_MATRIX calls only need to scatter new values.
   MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix *Acsr,*Bcsr,*Ccsr;
  PetscInt Annz,Bnnz;
  cusparseStatus_t stat;
  PetscInt i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n; /* C is m x (cols(A)+cols(B)) */
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices = NULL;
    /* C is stored with full (non-compressed) rows */
    c->compressedrow.use = PETSC_FALSE;
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows = m;
    Ccusp->mat = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows = m;
    Ccsr->num_cols = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by SpMV-type calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz; /* concatenation: nnz(C) = nnz(A) + nnz(B) exactly */
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag each entry with 1 (from A) or 0 (from B) so the merge output records provenance */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* view B's columns shifted by cols(A) without modifying B */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); /* shift in place, undone after the merge */
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz); /* cooPerm[0..Annz) receives A's slots, cooPerm[Annz..) B's */
      /* both inputs are (row,col)-sorted, so a single merge yields sorted COO for C */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* restore B's original column indices */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the destination positions by provenance tag: A's into p1, B's into p2
         (equivalent to the partition_copy above, which fails on some CUDA versions) */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress merged COO row indices back to CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        /* C' = [A' ; B'] (vertical stacking): n x m with the same nnz */
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overlap the shared boundary offset with B's first entry */
        }
        if (BT) {
          /* B's transpose row offsets start where A's entries end, i.e. shifted by nnz(A) */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* build the host (Mat_SeqAIJ) mirror of C's sparsity pattern */
    c->singlemalloc = PETSC_FALSE;
    c->free_a = PETSC_TRUE;
    c->free_ij = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* fill per-row lengths and row statistics from the CSR row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a)); /* host values allocated but not filled: data lives on the GPU */
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: sparsity of C already built; only scatter new values via cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries); /* cooPerm[0..nnz(A)) are A's slots in C, the rest are B's */
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C via the stored permutation */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose &&
B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C's cached transpose values by re-stacking A' and B' values */
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* values were written only on the device */
  PetscFunctionReturn(0);
}

/* Copies selected entries of the device value array of A into v.

   If idx is provided, v[k] = a(idx[k]) for k = 0..n-1, gathered on the GPU;
   otherwise the first n entries are copied verbatim.  v may point to either
   host or device memory (detected with isCudaMem()); for host memory the
   gather goes through a temporary device buffer followed by a device-to-host
   copy. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the index set, then gather av[idx[k]] -> dv[k] on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* bug fix: a host destination means the data moved GPU->CPU, so log it as such
     (was PetscLogCpuToGpu, which logged the wrong transfer direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}