/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Option-name table consumed by PetscOptionsEnum(); trailing entries are the enum
   type name, option prefix, and the required NULL sentinel */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
/* Entries must stay in 0-based integer value order of the cusparse enums above;
   the PetscCheck()s in MatSetFromOptions_SeqAIJCUSPARSE() guard this invariant */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* Two C++ overloads: one destroys a triangular-factor struct, the other a mult struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Type-specific implementation behind the MatCUSPARSESetFormat() method dispatch.
   A sequential AIJCUSPARSE matrix keeps a single storage-format field, so
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are equivalent here; anything else errors. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that do not implement it */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): just records the flag */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this method to specify if the solve is done on the CPU or GPU (GPU is the default).
160 161 Level: intermediate 162 163 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 164 @*/ 165 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 166 { 167 PetscFunctionBegin; 168 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169 PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 170 PetscFunctionReturn(0); 171 } 172 173 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 174 { 175 PetscFunctionBegin; 176 switch (op) { 177 case MAT_FORM_EXPLICIT_TRANSPOSE: 178 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 179 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 180 A->form_explicit_transpose = flg; 181 break; 182 default: 183 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 184 break; 185 } 186 PetscFunctionReturn(0); 187 } 188 189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 190 191 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 192 { 193 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 194 IS isrow = b->row,iscol = b->col; 195 PetscBool row_identity,col_identity; 196 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 197 198 PetscFunctionBegin; 199 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 200 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 201 B->offloadmask = PETSC_OFFLOAD_CPU; 202 /* determine which version of MatSolve needs to be used. 
*/ 203 PetscCall(ISIdentity(isrow,&row_identity)); 204 PetscCall(ISIdentity(iscol,&col_identity)); 205 206 if (!cusparsestruct->use_cpu_solve) { 207 if (row_identity && col_identity) { 208 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 209 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 210 } else { 211 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 212 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 213 } 214 } 215 B->ops->matsolve = NULL; 216 B->ops->matsolvetranspose = NULL; 217 218 /* get the triangular factors */ 219 if (!cusparsestruct->use_cpu_solve) { 220 PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 226 { 227 MatCUSPARSEStorageFormat format; 228 PetscBool flg; 229 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 230 231 PetscFunctionBegin; 232 PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 233 if (A->factortype == MAT_FACTOR_NONE) { 234 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 235 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 236 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 237 238 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 239 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 240 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 241 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 242 if (flg) 
PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 243 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 244 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 245 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 246 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 247 #if CUSPARSE_VERSION > 11301 248 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 249 #else 250 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 251 #endif 252 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 253 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 254 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 255 256 PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 257 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 258 PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 259 #endif 260 } 261 PetscOptionsHeadEnd(); 262 PetscFunctionReturn(0); 263 } 264 265 static PetscErrorCode 
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  /* Builds (or refreshes the values of) the unit-lower-triangular ILU factor L on the
     GPU from the host-side SeqAIJ factor storage. First call allocates the CSR arrays
     in pinned host memory, uploads them and runs the cusparse solve analysis; later
     calls only re-upload the numerical values. */
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for the CSR arrays to speed the host->device copy */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the unit diagonal entry after the strictly-lower part of the row */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  loTriFactor->solveInfo,
                                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; /* unit diagonal */
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds (or refreshes the values of) the non-unit upper-triangular ILU factor U on
   the GPU; mirror image of MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
413 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 414 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 415 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 416 const MatScalar *aa = a->a,*v; 417 PetscInt *AiUp, *AjUp; 418 PetscInt i,nz, nzUpper, offset; 419 420 PetscFunctionBegin; 421 if (!n) PetscFunctionReturn(0); 422 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 423 try { 424 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 425 nzUpper = adiag[0]-adiag[n]; 426 if (!upTriFactor) { 427 PetscScalar *AAUp; 428 429 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 430 431 /* Allocate Space for the upper triangular matrix */ 432 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 433 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 434 435 /* Fill the upper triangular matrix */ 436 AiUp[0]=(PetscInt) 0; 437 AiUp[n]=nzUpper; 438 offset = nzUpper; 439 for (i=n-1; i>=0; i--) { 440 v = aa + adiag[i+1] + 1; 441 vi = aj + adiag[i+1] + 1; 442 443 /* number of elements NOT on the diagonal */ 444 nz = adiag[i] - adiag[i+1]-1; 445 446 /* decrement the offset */ 447 offset -= (nz+1); 448 449 /* first, set the diagonal elements */ 450 AjUp[offset] = (PetscInt) i; 451 AAUp[offset] = (MatScalar)1./v[nz]; 452 AiUp[i] = AiUp[i+1] - (nz+1); 453 454 PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 455 PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 456 } 457 458 /* allocate space for the triangular factor information */ 459 PetscCall(PetscNew(&upTriFactor)); 460 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 461 462 /* Create the matrix description */ 463 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 464 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 
465 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 466 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 467 #else 468 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 469 #endif 470 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 471 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 472 473 /* set the operation */ 474 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 475 476 /* set the matrix */ 477 upTriFactor->csrMat = new CsrMatrix; 478 upTriFactor->csrMat->num_rows = n; 479 upTriFactor->csrMat->num_cols = n; 480 upTriFactor->csrMat->num_entries = nzUpper; 481 482 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 483 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 484 485 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 486 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 487 488 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 489 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 490 491 /* Create the solve analysis information */ 492 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 493 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 494 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 495 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 496 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 497 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 498 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 499 &upTriFactor->solveBufferSize)); 500 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 501 #endif 502 503 /* perform the solve analysis */ 504 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 
505 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 506 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 507 upTriFactor->csrMat->column_indices->data().get(), 508 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 509 upTriFactor->solveInfo, 510 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 511 #else 512 upTriFactor->solveInfo)); 513 #endif 514 PetscCallCUDA(WaitForCUDA()); 515 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 516 517 /* assign the pointer */ 518 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 519 upTriFactor->AA_h = AAUp; 520 PetscCallCUDA(cudaFreeHost(AiUp)); 521 PetscCallCUDA(cudaFreeHost(AjUp)); 522 PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 523 } else { 524 if (!upTriFactor->AA_h) { 525 PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 526 } 527 /* Fill the upper triangular matrix */ 528 offset = nzUpper; 529 for (i=n-1; i>=0; i--) { 530 v = aa + adiag[i+1] + 1; 531 532 /* number of elements NOT on the diagonal */ 533 nz = adiag[i] - adiag[i+1]-1; 534 535 /* decrement the offset */ 536 offset -= (nz+1); 537 538 /* first, set the diagonal elements */ 539 upTriFactor->AA_h[offset] = 1./v[nz]; 540 PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 541 } 542 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 543 PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 544 } 545 } catch(char *ex) { 546 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 547 } 548 } 549 PetscFunctionReturn(0); 550 } 551 552 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 553 { 554 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 555 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 556 IS isrow = a->row,iscol = a->icol; 557 PetscBool row_identity,col_identity; 558 
PetscInt n = A->rmap->n; 559 560 PetscFunctionBegin; 561 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 562 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 563 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 564 565 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 566 cusparseTriFactors->nnz=a->nz; 567 568 A->offloadmask = PETSC_OFFLOAD_BOTH; 569 /* lower triangular indices */ 570 PetscCall(ISIdentity(isrow,&row_identity)); 571 if (!row_identity && !cusparseTriFactors->rpermIndices) { 572 const PetscInt *r; 573 574 PetscCall(ISGetIndices(isrow,&r)); 575 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 576 cusparseTriFactors->rpermIndices->assign(r, r+n); 577 PetscCall(ISRestoreIndices(isrow,&r)); 578 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 579 } 580 581 /* upper triangular indices */ 582 PetscCall(ISIdentity(iscol,&col_identity)); 583 if (!col_identity && !cusparseTriFactors->cpermIndices) { 584 const PetscInt *c; 585 586 PetscCall(ISGetIndices(iscol,&c)); 587 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 588 cusparseTriFactors->cpermIndices->assign(c, c+n); 589 PetscCall(ISRestoreIndices(iscol,&c)); 590 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 591 } 592 PetscFunctionReturn(0); 593 } 594 595 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 596 { 597 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 598 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 599 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 600 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 601 PetscInt *AiUp, *AjUp; 602 PetscScalar *AAUp; 603 PetscScalar *AALo; 604 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 605 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 606 const 
PetscInt *ai = b->i,*aj = b->j,*vj; 607 const MatScalar *aa = b->a,*v; 608 609 PetscFunctionBegin; 610 if (!n) PetscFunctionReturn(0); 611 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 612 try { 613 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 614 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 615 if (!upTriFactor && !loTriFactor) { 616 /* Allocate Space for the upper triangular matrix */ 617 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 618 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 619 620 /* Fill the upper triangular matrix */ 621 AiUp[0]=(PetscInt) 0; 622 AiUp[n]=nzUpper; 623 offset = 0; 624 for (i=0; i<n; i++) { 625 /* set the pointers */ 626 v = aa + ai[i]; 627 vj = aj + ai[i]; 628 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 629 630 /* first, set the diagonal elements */ 631 AjUp[offset] = (PetscInt) i; 632 AAUp[offset] = (MatScalar)1.0/v[nz]; 633 AiUp[i] = offset; 634 AALo[offset] = (MatScalar)1.0/v[nz]; 635 636 offset+=1; 637 if (nz>0) { 638 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 639 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 640 for (j=offset; j<offset+nz; j++) { 641 AAUp[j] = -AAUp[j]; 642 AALo[j] = AAUp[j]/v[nz]; 643 } 644 offset+=nz; 645 } 646 } 647 648 /* allocate space for the triangular factor information */ 649 PetscCall(PetscNew(&upTriFactor)); 650 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 651 652 /* Create the matrix description */ 653 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 654 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 655 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 656 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 657 #else 658 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 659 #endif 660 
PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 661 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 662 663 /* set the matrix */ 664 upTriFactor->csrMat = new CsrMatrix; 665 upTriFactor->csrMat->num_rows = A->rmap->n; 666 upTriFactor->csrMat->num_cols = A->cmap->n; 667 upTriFactor->csrMat->num_entries = a->nz; 668 669 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 670 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 671 672 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 673 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 674 675 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 676 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 677 678 /* set the operation */ 679 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 680 681 /* Create the solve analysis information */ 682 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 683 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 684 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 685 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 686 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 687 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 688 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 689 &upTriFactor->solveBufferSize)); 690 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 691 #endif 692 693 /* perform the solve analysis */ 694 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 695 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 696 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 697 
upTriFactor->csrMat->column_indices->data().get(), 698 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 699 upTriFactor->solveInfo, 700 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 701 #else 702 upTriFactor->solveInfo)); 703 #endif 704 PetscCallCUDA(WaitForCUDA()); 705 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 706 707 /* assign the pointer */ 708 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 709 710 /* allocate space for the triangular factor information */ 711 PetscCall(PetscNew(&loTriFactor)); 712 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 713 714 /* Create the matrix description */ 715 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 716 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 717 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 718 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 719 #else 720 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 721 #endif 722 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 723 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 724 725 /* set the operation */ 726 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 727 728 /* set the matrix */ 729 loTriFactor->csrMat = new CsrMatrix; 730 loTriFactor->csrMat->num_rows = A->rmap->n; 731 loTriFactor->csrMat->num_cols = A->cmap->n; 732 loTriFactor->csrMat->num_entries = a->nz; 733 734 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 735 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 736 737 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 738 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 739 740 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 741 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 742 743 /* Create the solve 
analysis information */ 744 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 745 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 746 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 747 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 748 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 749 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 750 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 751 &loTriFactor->solveBufferSize)); 752 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 753 #endif 754 755 /* perform the solve analysis */ 756 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 757 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 758 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 759 loTriFactor->csrMat->column_indices->data().get(), 760 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 761 loTriFactor->solveInfo, 762 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 763 #else 764 loTriFactor->solveInfo)); 765 #endif 766 PetscCallCUDA(WaitForCUDA()); 767 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 768 769 /* assign the pointer */ 770 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 771 772 PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 773 PetscCallCUDA(cudaFreeHost(AiUp)); 774 PetscCallCUDA(cudaFreeHost(AjUp)); 775 } else { 776 /* Fill the upper triangular matrix */ 777 offset = 0; 778 for (i=0; i<n; i++) { 779 /* set the pointers */ 780 v = aa + ai[i]; 781 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 782 783 /* first, set the diagonal elements */ 784 AAUp[offset] = 1.0/v[nz]; 785 AALo[offset] = 1.0/v[nz]; 786 787 offset+=1; 788 if (nz>0) { 789 
PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 790
for (j=offset; j<offset+nz; j++) { 791
/* upper factor stores -a_ij; lower factor additionally scales by the inverted diagonal v[nz] */
AAUp[j] = -AAUp[j]; 792
AALo[j] = AAUp[j]/v[nz]; 793
} 794
offset+=nz; 795
} 796
} 797
PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 798
PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 799
/* structure already on the GPU from a previous call: only refresh the numerical values */
upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 800
loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 801
PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 802
} 803
PetscCallCUDA(cudaFreeHost(AAUp)); 804
PetscCallCUDA(cudaFreeHost(AALo)); 805
} catch(char *ex) { 806
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 807
} 808
} 809
PetscFunctionReturn(0); 810
} 811
/*
   Push the ICC factorization of A onto the GPU: builds the device triangular
   factor matrices, sizes the per-solve work vector, records the factor nnz
   count, and (for a non-identity ordering) caches the row permutation and its
   inverse on the GPU.
*/
812
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 813
{ 814
Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 815
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 816
IS ip = a->row; 817
PetscBool perm_identity; 818
PetscInt n = A->rmap->n; 819
820
PetscFunctionBegin; 821
PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 822
PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 823
if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 824
/* off-diagonal entries are stored in both factors, the diagonal once
   (presumably a->nz counts one triangle including the diagonal -- verify) */
cusparseTriFactors->nnz=(a->nz-n)*2 + n; 825
826
A->offloadmask = PETSC_OFFLOAD_BOTH; 827
828
/* lower triangular indices */ 829
PetscCall(ISIdentity(ip,&perm_identity)); 830
/* non-identity ordering: cache the permutation (rip) and its inverse (irip) on the GPU */
if (!perm_identity) { 831
IS iip; 832
const PetscInt *irip,*rip; 833
834
PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 835
PetscCall(ISGetIndices(iip,&irip)); 836
PetscCall(ISGetIndices(ip,&rip)); 837
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 838
cusparseTriFactors->rpermIndices->assign(rip, rip+n); 839
cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 840
cusparseTriFactors->cpermIndices->assign(irip, irip+n); 841
PetscCall(ISRestoreIndices(iip,&irip)); 842
PetscCall(ISDestroy(&iip)); 843
PetscCall(ISRestoreIndices(ip,&rip)); 844
PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 845
} 846
PetscFunctionReturn(0); 847
} 848
/*
   Numeric Cholesky factorization: the factorization itself is done on the CPU
   by MatCholeskyFactorNumeric_SeqAIJ; the factors are then mirrored to the GPU
   and the solve callbacks are selected based on whether the ordering is the
   identity.
*/
849
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 850
{ 851
Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 852
IS ip = b->row; 853
PetscBool perm_identity; 854
855
PetscFunctionBegin; 856
PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 857
PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 858
B->offloadmask = PETSC_OFFLOAD_CPU; 859
/* determine which version of MatSolve needs to be used. */ 860
PetscCall(ISIdentity(ip,&perm_identity)); 861
if (perm_identity) { 862
B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 863
B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 864
B->ops->matsolve = NULL; 865
B->ops->matsolvetranspose = NULL; 866
} else { 867
B->ops->solve = MatSolve_SeqAIJCUSPARSE; 868
B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 869
B->ops->matsolve = NULL; 870
B->ops->matsolvetranspose = NULL; 871
} 872
873
/* get the triangular factors */ 874
PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 875
PetscFunctionReturn(0); 876
} 877
878
/*
   Build explicit (CSC) transposes of both triangular factors and run the
   cusparse solve analysis on them, so that MatSolveTranspose can use
   non-transposed triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 879
{ 880
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 881
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 882
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 883
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 884
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 885
cusparseIndexBase_t indexBase; 886
cusparseMatrixType_t matrixType; 887
cusparseFillMode_t fillMode; 888
cusparseDiagType_t diagType; 889
890
PetscFunctionBegin; 891
/* allocate space for the
transpose of the lower triangular factor */ 892
PetscCall(PetscNew(&loTriFactorT)); 893
loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 894
895
/* set the matrix descriptors of the lower triangular factor */ 896
matrixType = cusparseGetMatType(loTriFactor->descr); 897
indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 898
/* transposing flips the fill mode */
fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 899
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 900
diagType = cusparseGetMatDiagType(loTriFactor->descr); 901
902
/* Create the matrix description */ 903
PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 904
PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 905
PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 906
PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 907
PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 908
909
/* set the operation: the transpose is stored explicitly, so solve non-transposed */ 910
loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 911
912
/* allocate GPU space for the CSC of the lower triangular factor*/ 913
loTriFactorT->csrMat = new CsrMatrix; 914
loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 915
loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 916
loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 917
loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 918
loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 919
loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 920
921
/* compute the transpose of the lower triangular factor, i.e.
the CSC */ 922
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 923
PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 924
loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 925
loTriFactor->csrMat->values->data().get(), 926
loTriFactor->csrMat->row_offsets->data().get(), 927
loTriFactor->csrMat->column_indices->data().get(), 928
loTriFactorT->csrMat->values->data().get(), 929
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 930
CUSPARSE_ACTION_NUMERIC,indexBase, 931
CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 932
PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 933
#endif 934
935
PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 936
PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 937
loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 938
loTriFactor->csrMat->values->data().get(), 939
loTriFactor->csrMat->row_offsets->data().get(), 940
loTriFactor->csrMat->column_indices->data().get(), 941
loTriFactorT->csrMat->values->data().get(), 942
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 943
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 944
CUSPARSE_ACTION_NUMERIC, indexBase, 945
CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 946
#else 947
loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 948
CUSPARSE_ACTION_NUMERIC, indexBase)); 949
#endif 950
PetscCallCUDA(WaitForCUDA()); 951
/* fix: was PetscLogEventBegin -- the event begun at the csr2csc call above must be
   ended here, otherwise MAT_CUSPARSEGenerateTranspose is begun twice and never ended */
PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 952
953
/* Create the solve analysis information */ 954
PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 955
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 956
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 957
PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 958
loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 959
loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 960
loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 961
&loTriFactorT->solveBufferSize)); 962
PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 963
#endif 964
965
/* perform the solve analysis */ 966
PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 967
loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 968
loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 969
loTriFactorT->csrMat->column_indices->data().get(), 970
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 971
loTriFactorT->solveInfo, 972
loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 973
#else 974
loTriFactorT->solveInfo)); 975
#endif 976
PetscCallCUDA(WaitForCUDA()); 977
PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 978
979
/* assign the pointer */ 980
((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 981
982
/*********************************************/ 983
/* Now the Transpose of the Upper Tri Factor */ 984
/*********************************************/ 985
986
/* allocate space for the transpose of the upper triangular factor */ 987
PetscCall(PetscNew(&upTriFactorT)); 988
upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 989
990
/* set the matrix descriptors of the upper triangular factor */ 991
matrixType = cusparseGetMatType(upTriFactor->descr); 992
indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 993
fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 994
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 995
diagType = cusparseGetMatDiagType(upTriFactor->descr); 996
997
/* Create the matrix description */ 998
PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 999
PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1000
PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1001
PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1002
PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1003
1004
/* set the operation */ 1005
upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1006
1007
/* allocate GPU space for the CSC of the upper triangular factor*/ 1008
upTriFactorT->csrMat = new CsrMatrix; 1009
upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1010
upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1011
upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1012
upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1013
upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1014
upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1015
1016
/* compute the transpose of the upper triangular factor, i.e.
the CSC */ 1017
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1018
PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1019
upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1020
upTriFactor->csrMat->values->data().get(), 1021
upTriFactor->csrMat->row_offsets->data().get(), 1022
upTriFactor->csrMat->column_indices->data().get(), 1023
upTriFactorT->csrMat->values->data().get(), 1024
upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1025
CUSPARSE_ACTION_NUMERIC,indexBase, 1026
CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1027
PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1028
#endif 1029
1030
PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1031
PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1032
upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1033
upTriFactor->csrMat->values->data().get(), 1034
upTriFactor->csrMat->row_offsets->data().get(), 1035
upTriFactor->csrMat->column_indices->data().get(), 1036
upTriFactorT->csrMat->values->data().get(), 1037
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038
upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1039
CUSPARSE_ACTION_NUMERIC, indexBase, 1040
CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1041
#else 1042
upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1043
CUSPARSE_ACTION_NUMERIC, indexBase)); 1044
#endif 1045
1046
PetscCallCUDA(WaitForCUDA()); 1047
/* fix: was PetscLogEventBegin -- end the event begun before the csr2csc above */
PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1048
1049
/* Create the solve analysis information */ 1050
PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1051
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1052
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1053
PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1054
upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1055
upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1056
upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1057
&upTriFactorT->solveBufferSize)); 1058
PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1059
#endif 1060
1061
/* perform the solve analysis */ 1062
/* TODO: the lower- and upper-factor transpose setup above is nearly identical and could be factored into a helper */ 1063
PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1064
upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1065
upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1066
upTriFactorT->csrMat->column_indices->data().get(), 1067
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1068
upTriFactorT->solveInfo, 1069
upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1070
#else 1071
upTriFactorT->solveInfo)); 1072
#endif 1073
1074
PetscCallCUDA(WaitForCUDA()); 1075
PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1076
1077
/* assign the pointer */ 1078
((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1079
PetscFunctionReturn(0); 1080
} 1081
1082
/* Unary functor: truncate the real part of a PetscScalar to a PetscInt
   (used below to recover the csr2csc permutation from transposed values) */
struct PetscScalarToPetscInt 1083
{ 1084
__host__ __device__ 1085
PetscInt operator()(PetscScalar s) 1086
{ 1087
return (PetscInt)PetscRealPart(s); 1088
} 1089
}; 1090
1091
/* Build or update the explicit transpose of A on the GPU (cusparsestruct->matTranspose). */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1092
{ 1093
Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1094
Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1095
Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1096
cusparseStatus_t stat; 1097
cusparseIndexBase_t indexBase; 1098
1099
PetscFunctionBegin; 1100
PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1101
matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1102
PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1103
matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1104
PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1105
/* nothing to do if the transpose is already current */
if (A->transupdated) PetscFunctionReturn(0); 1106
PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1107
PetscCall(PetscLogGpuTimeBegin()); 1108
if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1109
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1110
} 1111
if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1112
matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1113
PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1114
indexBase = cusparseGetMatIndexBase(matstruct->descr); 1115
PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1116
PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1117
1118
/* set alpha and beta */ 1119
PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 1120
PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 1121
PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1122
PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1123
PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1124
PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1125
1126
/* CSR path: allocate the transposed CSR (row/col dims swapped); values are filled by the csr2csc further below */
if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1127
CsrMatrix *matrixT = new CsrMatrix; 1128
matstructT->mat = matrixT; 1129
matrixT->num_rows = A->cmap->n; 1130
matrixT->num_cols = A->rmap->n; 1131
matrixT->num_entries = a->nz; 1132
matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1133
matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1134
matrixT->values = new THRUSTARRAY(a->nz); 1135
1136
if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1137
cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1138
1139
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1140
#if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1141
stat = cusparseCreateCsr(&matstructT->matDescr, 1142
matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1143
matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1144
matrixT->values->data().get(), 1145
CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1146
indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1147
#else 1148
/* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1149
see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1150
1151
I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1152
it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1153
when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1154
*/ 1155
if (matrixT->num_entries) { 1156
stat = cusparseCreateCsr(&matstructT->matDescr, 1157
matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1158
matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1159
matrixT->values->data().get(), 1160
CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1161
indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1162
1163
} else { 1164
matstructT->matDescr = NULL; 1165
matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1166
} 1167
#endif 1168
#endif 1169
/* ELL/HYB path (CUDA < 11 only): HYB -> CSR -> CSC -> HYB round trip */
} else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1170
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1171
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1172
#else 1173
CsrMatrix *temp = new CsrMatrix; 1174
CsrMatrix *tempT = new CsrMatrix; 1175
/* First convert HYB to CSR */ 1176
temp->num_rows = A->rmap->n; 1177
temp->num_cols = A->cmap->n; 1178
temp->num_entries = a->nz; 1179
temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1180
temp->column_indices = new THRUSTINTARRAY32(a->nz); 1181
temp->values = new THRUSTARRAY(a->nz); 1182
1183
stat = cusparse_hyb2csr(cusparsestruct->handle, 1184
matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1185
temp->values->data().get(), 1186
temp->row_offsets->data().get(), 1187
temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1188
1189
/* Next, convert CSR to CSC (i.e.
the matrix transpose) */ 1190
/* NOTE(review): tempT keeps rmap->n/cmap->n unswapped even though it holds the
   transpose; this looks correct only for square matrices -- confirm */
tempT->num_rows = A->rmap->n; 1191
tempT->num_cols = A->cmap->n; 1192
tempT->num_entries = a->nz; 1193
tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1194
tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1195
tempT->values = new THRUSTARRAY(a->nz); 1196
1197
stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1198
temp->num_cols, temp->num_entries, 1199
temp->values->data().get(), 1200
temp->row_offsets->data().get(), 1201
temp->column_indices->data().get(), 1202
tempT->values->data().get(), 1203
tempT->column_indices->data().get(), 1204
tempT->row_offsets->data().get(), 1205
CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1206
1207
/* Last, convert CSC to HYB */ 1208
cusparseHybMat_t hybMat; 1209
PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1210
cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1211
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1212
stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1213
matstructT->descr, tempT->values->data().get(), 1214
tempT->row_offsets->data().get(), 1215
tempT->column_indices->data().get(), 1216
hybMat, 0, partition);PetscCallCUSPARSE(stat); 1217
1218
/* assign the pointer */ 1219
matstructT->mat = hybMat; 1220
A->transupdated = PETSC_TRUE; 1221
/* delete temporaries */ 1222
if (tempT) { 1223
if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1224
if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1225
if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1226
delete (CsrMatrix*) tempT; 1227
} 1228
if (temp) { 1229
if (temp->values) delete (THRUSTARRAY*) temp->values; 1230
if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1231
if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1232
delete (CsrMatrix*) temp; 1233
} 1234
#endif 1235
} 1236
} 1237
if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /*
transpose mat struct may be already present, update data */ 1238
CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1239
CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1240
PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1241
PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1242
PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1243
PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1244
PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1245
PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1246
PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1247
PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1248
if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1249
cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1250
cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1251
PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1252
} 1253
/* first time through: run csr2csc on 0,1,2,... to obtain, via the transposed
   "values", the permutation csr2csc_i mapping CSR value slots to CSC slots;
   later updates then only permute-copy the values (below) */
if (!cusparsestruct->csr2csc_i) { 1254
THRUSTARRAY csr2csc_a(matrix->num_entries); 1255
PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1256
1257
indexBase = cusparseGetMatIndexBase(matstruct->descr); 1258
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259
void *csr2cscBuffer; 1260
size_t csr2cscBufferSize; 1261
stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1262
A->cmap->n, matrix->num_entries, 1263
matrix->values->data().get(), 1264
cusparsestruct->rowoffsets_gpu->data().get(), 1265
matrix->column_indices->data().get(), 1266
matrixT->values->data().get(), 1267
matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1268
CUSPARSE_ACTION_NUMERIC,indexBase, 1269
cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 1270
PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1271
#endif 1272
1273
if (matrix->num_entries) { 1274
/* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1275
mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1276
I checked every parameters and they were just fine. I have no clue why cusparse complains. 1277
1278
Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1279
should be filled with indexBase. So I just take a shortcut here. 1280
*/ 1281
stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1282
A->cmap->n,matrix->num_entries, 1283
csr2csc_a.data().get(), 1284
cusparsestruct->rowoffsets_gpu->data().get(), 1285
matrix->column_indices->data().get(), 1286
matrixT->values->data().get(), 1287
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1288
matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1289
CUSPARSE_ACTION_NUMERIC,indexBase, 1290
cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1291
#else 1292
matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1293
CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1294
#endif 1295
} else { 1296
matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1297
} 1298
1299
cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1300
PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1301
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1302
PetscCallCUDA(cudaFree(csr2cscBuffer)); 1303
#endif 1304
} 1305
/* update transposed values by gathering through the cached csr2csc permutation */
PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1306
thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1307
matrixT->values->begin())); 1308
} 1309
PetscCall(PetscLogGpuTimeEnd()); 1310
PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1311
/* the compressed row indices is not used for matTranspose */ 1312
matstructT->cprowIndices = NULL; 1313
/* assign the pointer */ 1314
((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1315
A->transupdated = PETSC_TRUE; 1316
PetscFunctionReturn(0); 1317
} 1318
1319
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1320
/*
   Solve A^T x = b using the explicitly transposed triangular factors
   (built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve): b is gathered
   through rpermIndices, then U^T and L^T solves run, then the result is
   gathered through cpermIndices into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1321
{ 1322
PetscInt n = xx->map->n; 1323
const PetscScalar *barray; 1324
PetscScalar *xarray; 1325
thrust::device_ptr<const PetscScalar> bGPU; 1326
thrust::device_ptr<PetscScalar> xGPU; 1327
cusparseStatus_t stat; 1328
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1329
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1330
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1331
THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1332
1333
PetscFunctionBegin; 1334
/* Analyze the matrix and create the transpose ...
on the fly */ 1335
if (!loTriFactorT && !upTriFactorT) { 1336
PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1337
loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1338
upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1339
} 1340
1341
/* Get the GPU pointers */ 1342
PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1343
PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1344
xGPU = thrust::device_pointer_cast(xarray); 1345
bGPU = thrust::device_pointer_cast(barray); 1346
1347
PetscCall(PetscLogGpuTimeBegin()); 1348
/* First, reorder with the row permutation */ 1349
thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1350
thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1351
xGPU); 1352
1353
/* First, solve U */ 1354
/* input xarray (permuted rhs), output tempGPU */
stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355
upTriFactorT->csrMat->num_rows, 1356
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1357
upTriFactorT->csrMat->num_entries, 1358
#endif 1359
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360
upTriFactorT->csrMat->values->data().get(), 1361
upTriFactorT->csrMat->row_offsets->data().get(), 1362
upTriFactorT->csrMat->column_indices->data().get(), 1363
upTriFactorT->solveInfo, 1364
xarray, 1365
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1366
tempGPU->data().get(), 1367
upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1368
#else 1369
tempGPU->data().get());PetscCallCUSPARSE(stat); 1370
#endif 1371
1372
/* Then, solve L */ 1373
/* input tempGPU, output xarray */
stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1374
loTriFactorT->csrMat->num_rows, 1375
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1376
loTriFactorT->csrMat->num_entries, 1377
#endif 1378
&PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1379
loTriFactorT->csrMat->values->data().get(), 1380
loTriFactorT->csrMat->row_offsets->data().get(), 1381
loTriFactorT->csrMat->column_indices->data().get(), 1382
loTriFactorT->solveInfo, 1383
tempGPU->data().get(), 1384
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1385
xarray, 1386
loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1387
#else 1388
xarray);PetscCallCUSPARSE(stat); 1389
#endif 1390
1391
/* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1392
thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1393
thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1394
tempGPU->begin()); 1395
1396
/* Copy the temporary to the full solution. */ 1397
thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1398
1399
/* restore */ 1400
PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1401
PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1402
PetscCall(PetscLogGpuTimeEnd()); 1403
PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1404
PetscFunctionReturn(0); 1405
} 1406
1407
/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for the identity ordering: no permutation copies are needed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1408
{ 1409
const PetscScalar *barray; 1410
PetscScalar *xarray; 1411
cusparseStatus_t stat; 1412
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1413
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1414
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1415
THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1416
1417
PetscFunctionBegin; 1418
/* Analyze the matrix and create the transpose ...
on the fly */ 1419
if (!loTriFactorT && !upTriFactorT) { 1420
PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1421
loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1422
upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1423
} 1424
1425
/* Get the GPU pointers */ 1426
PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1427
PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1428
1429
PetscCall(PetscLogGpuTimeBegin()); 1430
/* First, solve U */ 1431
stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1432
upTriFactorT->csrMat->num_rows, 1433
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1434
upTriFactorT->csrMat->num_entries, 1435
#endif 1436
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1437
upTriFactorT->csrMat->values->data().get(), 1438
upTriFactorT->csrMat->row_offsets->data().get(), 1439
upTriFactorT->csrMat->column_indices->data().get(), 1440
upTriFactorT->solveInfo, 1441
barray, 1442
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1443
tempGPU->data().get(), 1444
upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1445
#else 1446
tempGPU->data().get());PetscCallCUSPARSE(stat); 1447
#endif 1448
1449
/* Then, solve L */ 1450
stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1451
loTriFactorT->csrMat->num_rows, 1452
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453
loTriFactorT->csrMat->num_entries, 1454
#endif 1455
&PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1456
loTriFactorT->csrMat->values->data().get(), 1457
loTriFactorT->csrMat->row_offsets->data().get(), 1458
loTriFactorT->csrMat->column_indices->data().get(), 1459
loTriFactorT->solveInfo, 1460
tempGPU->data().get(), 1461
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462
xarray, 1463
loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1464
#else 1465
xarray);PetscCallCUSPARSE(stat); 1466
#endif 1467
1468
/* restore */ 1469
PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1470
PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1471
PetscCall(PetscLogGpuTimeEnd()); 1472
PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1473
PetscFunctionReturn(0); 1474
} 1475
1476
/* Solve A x = b: b is gathered through the row permutation into the work vector, then the L and U triangular solves run (definition continues past this excerpt). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1477
{ 1478
const PetscScalar *barray; 1479
PetscScalar *xarray; 1480
thrust::device_ptr<const PetscScalar> bGPU; 1481
thrust::device_ptr<PetscScalar> xGPU; 1482
cusparseStatus_t stat; 1483
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1484
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1485
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1486
THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1487
1488
PetscFunctionBegin; 1489
1490
/* Get the GPU pointers */ 1491
PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1492
PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1493
xGPU = thrust::device_pointer_cast(xarray); 1494
bGPU = thrust::device_pointer_cast(barray); 1495
1496
PetscCall(PetscLogGpuTimeBegin()); 1497
/* First, reorder with the row permutation */ 1498
thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1499
thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1500
tempGPU->begin()); 1501
1502
/* Next, solve L */ 1503
stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1504
loTriFactor->csrMat->num_rows, 1505
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1506
loTriFactor->csrMat->num_entries, 1507
#endif 1508
&PETSC_CUSPARSE_ONE, loTriFactor->descr, 1509
loTriFactor->csrMat->values->data().get(), 1510
loTriFactor->csrMat->row_offsets->data().get(), 1511
loTriFactor->csrMat->column_indices->data().get(),
1512 loTriFactor->solveInfo, 1513 tempGPU->data().get(), 1514 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1515 xarray, 1516 loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1517 #else 1518 xarray);PetscCallCUSPARSE(stat); 1519 #endif 1520 1521 /* Then, solve U */ 1522 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1523 upTriFactor->csrMat->num_rows, 1524 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1525 upTriFactor->csrMat->num_entries, 1526 #endif 1527 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1528 upTriFactor->csrMat->values->data().get(), 1529 upTriFactor->csrMat->row_offsets->data().get(), 1530 upTriFactor->csrMat->column_indices->data().get(), 1531 upTriFactor->solveInfo,xarray, 1532 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1533 tempGPU->data().get(), 1534 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1535 #else 1536 tempGPU->data().get());PetscCallCUSPARSE(stat); 1537 #endif 1538 1539 /* Last, reorder with the column permutation */ 1540 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1541 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1542 xGPU); 1543 1544 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1545 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1546 PetscCall(PetscLogGpuTimeEnd()); 1547 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1548 PetscFunctionReturn(0); 1549 } 1550 1551 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1552 { 1553 const PetscScalar *barray; 1554 PetscScalar *xarray; 1555 cusparseStatus_t stat; 1556 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1557 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1558 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1559 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1560 1561 PetscFunctionBegin; 1562 /* Get the GPU pointers */ 1563 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1564 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1565 1566 PetscCall(PetscLogGpuTimeBegin()); 1567 /* First, solve L */ 1568 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1569 loTriFactor->csrMat->num_rows, 1570 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1571 loTriFactor->csrMat->num_entries, 1572 #endif 1573 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1574 loTriFactor->csrMat->values->data().get(), 1575 loTriFactor->csrMat->row_offsets->data().get(), 1576 loTriFactor->csrMat->column_indices->data().get(), 1577 loTriFactor->solveInfo, 1578 barray, 1579 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1580 tempGPU->data().get(), 1581 loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1582 #else 1583 tempGPU->data().get());PetscCallCUSPARSE(stat); 1584 #endif 1585 1586 /* Next, solve U */ 1587 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1588 upTriFactor->csrMat->num_rows, 1589 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1590 upTriFactor->csrMat->num_entries, 1591 #endif 1592 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1593 upTriFactor->csrMat->values->data().get(), 1594 upTriFactor->csrMat->row_offsets->data().get(), 1595 upTriFactor->csrMat->column_indices->data().get(), 1596 upTriFactor->solveInfo, 1597 tempGPU->data().get(), 1598 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1599 xarray, 1600 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1601 #else 1602 xarray);PetscCallCUSPARSE(stat); 1603 #endif 1604 1605 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1606 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1607 PetscCall(PetscLogGpuTimeEnd()); 1608 
PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1609 PetscFunctionReturn(0); 1610 } 1611 1612 #if CUSPARSE_VERSION >= 11500 1613 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1614 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1615 { 1616 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1617 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1618 const PetscScalar *barray; 1619 PetscScalar *xarray; 1620 1621 PetscFunctionBegin; 1622 PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1623 PetscCall(VecCUDAGetArrayRead(b,&barray)); 1624 PetscCall(PetscLogGpuTimeBegin()); 1625 1626 /* Solve L*y = b */ 1627 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1628 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 1629 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1630 CUSPARSE_OPERATION_NON_TRANSPOSE, 1631 &PETSC_CUSPARSE_ONE, 1632 fs->spMatDescr_L, /* L Y = X */ 1633 fs->dnVecDescr_X, 1634 fs->dnVecDescr_Y, 1635 cusparse_scalartype, 1636 CUSPARSE_SPSV_ALG_DEFAULT, 1637 fs->spsvDescr_L)); 1638 1639 /* Solve U*x = y */ 1640 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1641 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1642 CUSPARSE_OPERATION_NON_TRANSPOSE, 1643 &PETSC_CUSPARSE_ONE, 1644 fs->spMatDescr_U, /* U X = Y */ 1645 fs->dnVecDescr_Y, 1646 fs->dnVecDescr_X, 1647 cusparse_scalartype, 1648 CUSPARSE_SPSV_ALG_DEFAULT, 1649 fs->spsvDescr_U)); 1650 1651 PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1652 PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1653 1654 PetscCall(PetscLogGpuTimeEnd()); 1655 PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1656 PetscFunctionReturn(0); 1657 } 1658 1659 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1660 { 1661 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1662 Mat_SeqAIJ *aij = 
(Mat_SeqAIJ*)fact->data; 1663 const PetscScalar *barray; 1664 PetscScalar *xarray; 1665 1666 PetscFunctionBegin; 1667 if (!fs->builtSolveTranspose) { /* Call MatSolveTranspose() for the first time */ 1668 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1669 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1670 CUSPARSE_OPERATION_TRANSPOSE, 1671 &PETSC_CUSPARSE_ONE, 1672 fs->spMatDescr_L, /* The matrix is still L. We only do tranpose solve with it */ 1673 fs->dnVecDescr_X, 1674 fs->dnVecDescr_Y, 1675 cusparse_scalartype, 1676 CUSPARSE_SPSV_ALG_DEFAULT, 1677 fs->spsvDescr_Lt, 1678 &fs->spsvBufferSize_Lt)); 1679 1680 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1681 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1682 CUSPARSE_OPERATION_TRANSPOSE, 1683 &PETSC_CUSPARSE_ONE, 1684 fs->spMatDescr_U, 1685 fs->dnVecDescr_X, 1686 fs->dnVecDescr_Y, 1687 cusparse_scalartype, 1688 CUSPARSE_SPSV_ALG_DEFAULT, 1689 fs->spsvDescr_Ut, 1690 &fs->spsvBufferSize_Ut)); 1691 1692 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut)); 1693 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 1694 1695 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1696 CUSPARSE_OPERATION_TRANSPOSE, 1697 &PETSC_CUSPARSE_ONE, 1698 fs->spMatDescr_L, 1699 fs->dnVecDescr_X, 1700 fs->dnVecDescr_Y, 1701 cusparse_scalartype, 1702 CUSPARSE_SPSV_ALG_DEFAULT, 1703 fs->spsvDescr_Lt, 1704 fs->spsvBuffer_Lt)); 1705 1706 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1707 CUSPARSE_OPERATION_TRANSPOSE, 1708 &PETSC_CUSPARSE_ONE, 1709 fs->spMatDescr_U, 1710 fs->dnVecDescr_X, 1711 fs->dnVecDescr_Y, 1712 cusparse_scalartype, 1713 CUSPARSE_SPSV_ALG_DEFAULT, 1714 fs->spsvDescr_Ut, 1715 fs->spsvBuffer_Ut)); 1716 fs->builtSolveTranspose = PETSC_TRUE; 1717 } 1718 1719 PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1720 PetscCall(VecCUDAGetArrayRead(b,&barray)); 1721 PetscCall(PetscLogGpuTimeBegin()); 1722 1723 /* Solve Ut*y = b */ 
1724 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1725 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 1726 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1727 CUSPARSE_OPERATION_TRANSPOSE, 1728 &PETSC_CUSPARSE_ONE, 1729 fs->spMatDescr_U, /* Ut Y = X */ 1730 fs->dnVecDescr_X, 1731 fs->dnVecDescr_Y, 1732 cusparse_scalartype, 1733 CUSPARSE_SPSV_ALG_DEFAULT, 1734 fs->spsvDescr_Ut)); 1735 1736 /* Solve Lt*x = y */ 1737 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1738 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1739 CUSPARSE_OPERATION_TRANSPOSE, 1740 &PETSC_CUSPARSE_ONE, 1741 fs->spMatDescr_L, /* Lt X = Y */ 1742 fs->dnVecDescr_Y, 1743 fs->dnVecDescr_X, 1744 cusparse_scalartype, 1745 CUSPARSE_SPSV_ALG_DEFAULT, 1746 fs->spsvDescr_Lt)); 1747 1748 PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1749 PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1750 PetscCall(PetscLogGpuTimeEnd()); 1751 PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1752 PetscFunctionReturn(0); 1753 } 1754 1755 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info) 1756 { 1757 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1758 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1759 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1760 CsrMatrix *Acsr; 1761 PetscInt m,nz; 1762 PetscBool flg; 1763 1764 PetscFunctionBegin; 1765 if (PetscDefined(USE_DEBUG)) { 1766 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 1767 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 1768 } 1769 1770 /* Copy A's value to fact */ 1771 m = fact->rmap->n; 1772 nz = aij->nz; 1773 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1774 Acsr = (CsrMatrix*)Acusp->mat->mat; 1775 
PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1776 1777 /* Factorize fact inplace */ 1778 if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1779 fs->matDescr_M, 1780 fs->csrVal, 1781 fs->csrRowPtr, 1782 fs->csrColIdx, 1783 fs->ilu0Info_M, 1784 fs->policy_M, 1785 fs->factBuffer_M)); 1786 if (PetscDefined(USE_DEBUG)) { 1787 int numerical_zero; 1788 cusparseStatus_t status; 1789 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1790 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero); 1791 } 1792 1793 /* From my experiment, cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() */ 1794 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1795 CUSPARSE_OPERATION_NON_TRANSPOSE, 1796 &PETSC_CUSPARSE_ONE, 1797 fs->spMatDescr_L, 1798 fs->dnVecDescr_X, 1799 fs->dnVecDescr_Y, 1800 cusparse_scalartype, 1801 CUSPARSE_SPSV_ALG_DEFAULT, 1802 fs->spsvDescr_L, 1803 fs->spsvBuffer_L)); 1804 1805 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1806 CUSPARSE_OPERATION_NON_TRANSPOSE, 1807 &PETSC_CUSPARSE_ONE, 1808 fs->spMatDescr_U, 1809 fs->dnVecDescr_X, 1810 fs->dnVecDescr_Y, 1811 cusparse_scalartype, 1812 CUSPARSE_SPSV_ALG_DEFAULT, 1813 fs->spsvDescr_U, 1814 fs->spsvBuffer_U)); 1815 1816 fact->offloadmask = PETSC_OFFLOAD_GPU; 1817 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0; 1818 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0; 1819 fact->ops->matsolve = NULL; 1820 fact->ops->matsolvetranspose = NULL; 1821 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1822 PetscFunctionReturn(0); 1823 } 1824 1825 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat 
fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,fs->csrRowPtr,fs->csrColIdx,fs->csrVal,
                                      CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,CUSPARSE_SPMAT_FILL_MODE,&fillMode,sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,CUSPARSE_SPMAT_DIAG_TYPE,&diagType,sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,fs->csrRowPtr,fs->csrColIdx,fs->csrVal,
                                      CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,CUSPARSE_SPMAT_FILL_MODE,&fillMode,sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,CUSPARSE_SPMAT_DIAG_TYPE,&diagType,sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                        fs->matDescr_M,fs->csrVal,fs->csrRowPtr,fs->csrColIdx,
                                                        fs->ilu0Info_M,&fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y,cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_L,&fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_U,fs->dnVecDescr_X,fs->dnVecDescr_Y,cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_U,&fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     spsvBuffer_L and spsvBuffer_U can not be shared.
   */
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M));

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                      fs->matDescr_M,fs->csrVal,fs->csrRowPtr,fs->csrColIdx,
                                                      fs->ilu0Info_M,fs->policy_M,fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,*Adiag,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i=0; i<m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i+1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Solve with the IC(0) factor: L y = b, then L^T x = y (the same routine also serves as solvetranspose) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X,fs->dnVecDescr_Y,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,fs->dnVecDescr_X,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz,
                                            fs->matDescr_M,
                                            fs->csrVal,
                                            fs->csrRowPtr,
                                            fs->csrColIdx,
                                            fs->ic0Info_M,
                                            fs->policy_M,
                                            fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y,cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_L,fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y,cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_Lt,fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij =
(Mat_SeqAIJ*)fact->data; 2154 PetscInt m,nz; 2155 2156 PetscFunctionBegin; 2157 if (PetscDefined(USE_DEBUG)) { 2158 PetscInt i; 2159 PetscBool flg,missing; 2160 2161 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2162 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 2163 PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n); 2164 PetscCall(MatMissingDiagonal(A,&missing,&i)); 2165 PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i); 2166 } 2167 2168 /* Free the old stale stuff */ 2169 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2170 2171 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2172 but they will not be used. Allocate them just for easy debugging. 2173 */ 2174 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/)); 2175 2176 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2177 fact->factortype = MAT_FACTOR_ICC; 2178 fact->info.factor_mallocs = 0; 2179 fact->info.fill_ratio_given = info->fill; 2180 fact->info.fill_ratio_needed = 1.0; 2181 2182 aij->row = NULL; 2183 aij->col = NULL; 2184 2185 /* ====================================================================== */ 2186 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2187 /* We'll do in-place factorization on fact */ 2188 /* ====================================================================== */ 2189 const int *Ai,*Aj; 2190 2191 m = fact->rmap->n; 2192 nz = aij->nz; 2193 2194 PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1))); 2195 PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz)); 2196 PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz)); 2197 PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */ 2198 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2199 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2200 2201 /* ====================================================================== */ 2202 /* Create mat descriptors for M, L */ 2203 /* ====================================================================== */ 2204 cusparseFillMode_t fillMode; 2205 cusparseDiagType_t diagType; 2206 2207 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2208 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2209 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2210 2211 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2212 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2213 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2214 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2215 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
2216 */ 2217 fillMode = CUSPARSE_FILL_MODE_LOWER; 2218 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2219 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz, 2220 fs->csrRowPtr, 2221 fs->csrColIdx, 2222 fs->csrVal, 2223 CUSPARSE_INDEX_32I, 2224 CUSPARSE_INDEX_32I, 2225 CUSPARSE_INDEX_BASE_ZERO, 2226 cusparse_scalartype)); 2227 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2228 CUSPARSE_SPMAT_FILL_MODE, 2229 &fillMode, 2230 sizeof(fillMode))); 2231 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2232 CUSPARSE_SPMAT_DIAG_TYPE, 2233 &diagType, 2234 sizeof(diagType))); 2235 2236 /* ========================================================================= */ 2237 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2238 /* ========================================================================= */ 2239 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2240 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, 2241 fs->matDescr_M, 2242 fs->csrVal, 2243 fs->csrRowPtr, 2244 fs->csrColIdx, 2245 fs->ic0Info_M, 2246 &fs->factBufferSize_M)); 2247 2248 PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m)); 2249 PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m)); 2250 2251 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype)); 2252 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype)); 2253 2254 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2255 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2256 CUSPARSE_OPERATION_NON_TRANSPOSE, 2257 &PETSC_CUSPARSE_ONE, 2258 fs->spMatDescr_L, 2259 fs->dnVecDescr_X, 2260 fs->dnVecDescr_Y, 2261 cusparse_scalartype, 2262 CUSPARSE_SPSV_ALG_DEFAULT, 2263 fs->spsvDescr_L, 2264 &fs->spsvBufferSize_L)); 2265 2266 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2267 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2268 
CUSPARSE_OPERATION_TRANSPOSE, 2269 &PETSC_CUSPARSE_ONE, 2270 fs->spMatDescr_L, 2271 fs->dnVecDescr_X, 2272 fs->dnVecDescr_Y, 2273 cusparse_scalartype, 2274 CUSPARSE_SPSV_ALG_DEFAULT, 2275 fs->spsvDescr_Lt, 2276 &fs->spsvBufferSize_Lt)); 2277 2278 PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M)); 2279 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L)); 2280 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 2281 2282 /* ========================================================================== */ 2283 /* Perform analysis of ic0 on M */ 2284 /* The lower triangular part of M has the same sparsity pattern as L */ 2285 /* ========================================================================== */ 2286 int structural_zero; 2287 cusparseStatus_t status; 2288 2289 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2290 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, 2291 fs->matDescr_M, 2292 fs->csrVal, 2293 fs->csrRowPtr, 2294 fs->csrColIdx, 2295 fs->ic0Info_M, 2296 fs->policy_M, 2297 fs->factBuffer_M)); 2298 if (PetscDefined(USE_DEBUG)) { 2299 /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2300 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2301 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero); 2302 } 2303 2304 /* Estimate FLOPs of the numeric factorization */ 2305 { 2306 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ*)A->data; 2307 PetscInt *Ai,nzRow,nzLeft; 2308 PetscLogDouble flops = 0.0; 2309 2310 Ai = Aseq->i; 2311 for (PetscInt i=0; i<m; i++) { 2312 nzRow = Ai[i+1] - Ai[i]; 2313 if (nzRow > 1) { 2314 /* We want to eliminate nonzeros left to the diagonal one by one. 
Assume each time, nonzeros right 2315 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2316 */ 2317 nzLeft = (nzRow-1)/2; 2318 flops += nzLeft*(2.0*nzRow-nzLeft+1); 2319 } 2320 } 2321 fs->numericFactFlops = flops; 2322 } 2323 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2324 PetscFunctionReturn(0); 2325 } 2326 #endif 2327 2328 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2329 { 2330 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2331 2332 PetscFunctionBegin; 2333 #if CUSPARSE_VERSION >= 11500 2334 PetscBool row_identity,col_identity; 2335 PetscCall(ISIdentity(isrow,&row_identity)); 2336 PetscCall(ISIdentity(iscol,&col_identity)); 2337 if (!info->levels && row_identity && col_identity) { 2338 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info)); 2339 } else 2340 #endif 2341 { 2342 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2343 PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2344 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2345 } 2346 PetscFunctionReturn(0); 2347 } 2348 2349 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2350 { 2351 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2352 2353 PetscFunctionBegin; 2354 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2355 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2356 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2357 PetscFunctionReturn(0); 2358 } 2359 2360 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2361 { 2362 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2363 2364 PetscFunctionBegin; 2365 
#if CUSPARSE_VERSION >= 11500 2366 PetscBool perm_identity; 2367 PetscCall(ISIdentity(perm,&perm_identity)); 2368 if (!info->levels && perm_identity) { 2369 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info)); 2370 } else 2371 #endif 2372 { 2373 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2374 PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 2375 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2376 } 2377 PetscFunctionReturn(0); 2378 } 2379 2380 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2381 { 2382 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2383 2384 PetscFunctionBegin; 2385 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2386 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 2387 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2388 PetscFunctionReturn(0); 2389 } 2390 2391 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 2392 { 2393 PetscFunctionBegin; 2394 *type = MATSOLVERCUSPARSE; 2395 PetscFunctionReturn(0); 2396 } 2397 2398 /*MC 2399 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 2400 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 2401 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2402 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2403 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2404 algorithms are not recommended. This class does NOT support direct solver operations. 
2405 2406 Level: beginner 2407 2408 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2409 M*/ 2410 2411 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 2412 { 2413 PetscInt n = A->rmap->n; 2414 2415 PetscFunctionBegin; 2416 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2417 PetscCall(MatSetSizes(*B,n,n,n,n)); 2418 (*B)->factortype = ftype; 2419 PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 2420 2421 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 2422 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2423 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2424 if (!A->boundtocpu) { 2425 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2426 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2427 } else { 2428 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2429 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2430 } 2431 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 2432 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2433 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2434 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2435 if (!A->boundtocpu) { 2436 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2437 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2438 } else { 2439 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2440 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2441 } 2442 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2443 
PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2444 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 2445 2446 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 2447 (*B)->canuseordering = PETSC_TRUE; 2448 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 2449 PetscFunctionReturn(0); 2450 } 2451 2452 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2453 { 2454 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2455 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2456 #if CUSPARSE_VERSION >= 13500 2457 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 2458 #endif 2459 2460 PetscFunctionBegin; 2461 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 2462 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2463 if (A->factortype == MAT_FACTOR_NONE) { 2464 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 2465 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2466 } 2467 #if CUSPARSE_VERSION >= 13500 2468 else if (fs->csrVal) { 2469 /* We have a factorized matrix on device and are able to copy it to host */ 2470 PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2471 } 2472 #endif 2473 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host"); 2474 PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 2475 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2476 A->offloadmask = PETSC_OFFLOAD_BOTH; 2477 } 2478 PetscFunctionReturn(0); 2479 } 2480 2481 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2482 { 2483 PetscFunctionBegin; 2484 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2485 *array = ((Mat_SeqAIJ*)A->data)->a; 2486 
PetscFunctionReturn(0); 2487 } 2488 2489 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2490 { 2491 PetscFunctionBegin; 2492 A->offloadmask = PETSC_OFFLOAD_CPU; 2493 *array = NULL; 2494 PetscFunctionReturn(0); 2495 } 2496 2497 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 2498 { 2499 PetscFunctionBegin; 2500 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2501 *array = ((Mat_SeqAIJ*)A->data)->a; 2502 PetscFunctionReturn(0); 2503 } 2504 2505 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 2506 { 2507 PetscFunctionBegin; 2508 *array = NULL; 2509 PetscFunctionReturn(0); 2510 } 2511 2512 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2513 { 2514 PetscFunctionBegin; 2515 *array = ((Mat_SeqAIJ*)A->data)->a; 2516 PetscFunctionReturn(0); 2517 } 2518 2519 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2520 { 2521 PetscFunctionBegin; 2522 A->offloadmask = PETSC_OFFLOAD_CPU; 2523 *array = NULL; 2524 PetscFunctionReturn(0); 2525 } 2526 2527 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 2528 { 2529 Mat_SeqAIJCUSPARSE *cusp; 2530 CsrMatrix *matrix; 2531 2532 PetscFunctionBegin; 2533 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2534 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2535 cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 2536 PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 2537 matrix = (CsrMatrix*)cusp->mat->mat; 2538 2539 if (i) { 2540 #if !defined(PETSC_USE_64BIT_INDICES) 2541 *i = matrix->row_offsets->data().get(); 2542 #else 2543 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 2544 #endif 
2545 } 2546 if (j) { 2547 #if !defined(PETSC_USE_64BIT_INDICES) 2548 *j = matrix->column_indices->data().get(); 2549 #else 2550 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 2551 #endif 2552 } 2553 if (a) *a = matrix->values->data().get(); 2554 if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 2555 PetscFunctionReturn(0); 2556 } 2557 2558 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2559 { 2560 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2561 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 2562 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2563 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 2564 cusparseStatus_t stat; 2565 PetscBool both = PETSC_TRUE; 2566 2567 PetscFunctionBegin; 2568 PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 2569 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2570 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2571 CsrMatrix *matrix; 2572 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 2573 2574 PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 2575 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2576 matrix->values->assign(a->a, a->a+a->nz); 2577 PetscCallCUDA(WaitForCUDA()); 2578 PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 2579 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2580 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 2581 } else { 2582 PetscInt nnz; 2583 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2584 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 2585 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 2586 delete cusparsestruct->workVector; 2587 delete cusparsestruct->rowoffsets_gpu; 2588 cusparsestruct->workVector = NULL; 2589 
cusparsestruct->rowoffsets_gpu = NULL; 2590 try { 2591 if (a->compressedrow.use) { 2592 m = a->compressedrow.nrows; 2593 ii = a->compressedrow.i; 2594 ridx = a->compressedrow.rindex; 2595 } else { 2596 m = A->rmap->n; 2597 ii = a->i; 2598 ridx = NULL; 2599 } 2600 PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 2601 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 2602 else nnz = a->nz; 2603 PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 2604 2605 /* create cusparse matrix */ 2606 cusparsestruct->nrows = m; 2607 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2608 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2609 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2610 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2611 2612 PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 2613 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 2614 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 2615 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2616 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2617 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2618 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2619 2620 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2621 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 2622 /* set the matrix */ 2623 CsrMatrix *mat= new CsrMatrix; 2624 mat->num_rows = m; 2625 mat->num_cols = A->cmap->n; 2626 mat->num_entries = nnz; 2627 mat->row_offsets = new THRUSTINTARRAY32(m+1); 2628 mat->row_offsets->assign(ii, ii + m+1); 2629 2630 
mat->column_indices = new THRUSTINTARRAY32(nnz); 2631 mat->column_indices->assign(a->j, a->j+nnz); 2632 2633 mat->values = new THRUSTARRAY(nnz); 2634 if (a->a) mat->values->assign(a->a, a->a+nnz); 2635 2636 /* assign the pointer */ 2637 matstruct->mat = mat; 2638 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2639 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2640 stat = cusparseCreateCsr(&matstruct->matDescr, 2641 mat->num_rows, mat->num_cols, mat->num_entries, 2642 mat->row_offsets->data().get(), mat->column_indices->data().get(), 2643 mat->values->data().get(), 2644 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2645 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2646 } 2647 #endif 2648 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 2649 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2650 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2651 #else 2652 CsrMatrix *mat= new CsrMatrix; 2653 mat->num_rows = m; 2654 mat->num_cols = A->cmap->n; 2655 mat->num_entries = nnz; 2656 mat->row_offsets = new THRUSTINTARRAY32(m+1); 2657 mat->row_offsets->assign(ii, ii + m+1); 2658 2659 mat->column_indices = new THRUSTINTARRAY32(nnz); 2660 mat->column_indices->assign(a->j, a->j+nnz); 2661 2662 mat->values = new THRUSTARRAY(nnz); 2663 if (a->a) mat->values->assign(a->a, a->a+nnz); 2664 2665 cusparseHybMat_t hybMat; 2666 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2667 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
2668 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2669 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 2670 matstruct->descr, mat->values->data().get(), 2671 mat->row_offsets->data().get(), 2672 mat->column_indices->data().get(), 2673 hybMat, 0, partition);PetscCallCUSPARSE(stat); 2674 /* assign the pointer */ 2675 matstruct->mat = hybMat; 2676 2677 if (mat) { 2678 if (mat->values) delete (THRUSTARRAY*)mat->values; 2679 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 2680 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 2681 delete (CsrMatrix*)mat; 2682 } 2683 #endif 2684 } 2685 2686 /* assign the compressed row indices */ 2687 if (a->compressedrow.use) { 2688 cusparsestruct->workVector = new THRUSTARRAY(m); 2689 matstruct->cprowIndices = new THRUSTINTARRAY(m); 2690 matstruct->cprowIndices->assign(ridx,ridx+m); 2691 tmp = m; 2692 } else { 2693 cusparsestruct->workVector = NULL; 2694 matstruct->cprowIndices = NULL; 2695 tmp = 0; 2696 } 2697 PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 2698 2699 /* assign the pointer */ 2700 cusparsestruct->mat = matstruct; 2701 } catch(char *ex) { 2702 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2703 } 2704 PetscCallCUDA(WaitForCUDA()); 2705 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2706 cusparsestruct->nonzerostate = A->nonzerostate; 2707 } 2708 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2709 } 2710 PetscFunctionReturn(0); 2711 } 2712 2713 struct VecCUDAPlusEquals 2714 { 2715 template <typename Tuple> 2716 __host__ __device__ 2717 void operator()(Tuple t) 2718 { 2719 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2720 } 2721 }; 2722 2723 struct VecCUDAEquals 2724 { 2725 template <typename Tuple> 2726 __host__ __device__ 2727 void operator()(Tuple t) 2728 { 2729 thrust::get<1>(t) = thrust::get<0>(t); 2730 } 2731 }; 2732 2733 struct 
VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* copy the second tuple element into the first (reverse of VecCUDAEquals) */
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to a MatProduct involving a SeqAIJCUSPARSE matrix */
struct MatMatCusparse {
  PetscBool      cisdense;  /* true if C started as MATSEQDENSE (CPU) and must be converted back */
  PetscScalar    *Bt;       /* pre CUDA-11 only: explicit transpose of B (csrmm lacks op(B)) */
  Mat            X;         /* intermediate dense result for PtAP/RARt */
  PetscBool      reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda;   /* Record leading dimensions of B and C here to detect changes */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2;  /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destroys the MatMatCusparse scratch data: frees device buffers, cuSPARSE descriptors,
   and the intermediate dense matrix X */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of sparse(A) * dense(B) products (AB, AtB, ABt, PtAP, RARt) performed on
   the GPU via cusparseSpMM (CUDA >= 11) or cusparse_csr_spmm (older). PtAP/RARt first
   compute the intermediate X = A*op(B) and then finish with a dense-dense multiply. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly-formed transpose and avoid the transpose operation */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* intermediate result goes into X; the final dense-dense product fills C */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ?
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2871 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2872 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2873 size_t mmBufferSize; 2874 if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2875 if (!mmdata->matBDescr) { 2876 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2877 mmdata->Blda = blda; 2878 } 2879 2880 if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2881 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2882 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2883 mmdata->Clda = clda; 2884 } 2885 2886 if (!mat->matDescr) { 2887 stat = cusparseCreateCsr(&mat->matDescr, 2888 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2889 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2890 csrmat->values->data().get(), 2891 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2892 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2893 } 2894 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2895 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2896 mmdata->matCDescr,cusparse_scalartype, 2897 cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2898 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2899 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2900 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2901 mmdata->mmBufferSize = mmBufferSize; 2902 } 2903 mmdata->initialized = PETSC_TRUE; 2904 } else { 2905 /* to be safe, always update pointers of the mats 
*/ 2906 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2907 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2908 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2909 } 2910 2911 /* do cusparseSpMM, which supports transpose on B */ 2912 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2913 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2914 mmdata->matCDescr,cusparse_scalartype, 2915 cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2916 #else 2917 PetscInt k; 2918 /* cusparseXcsrmm does not support transpose on B */ 2919 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2920 cublasHandle_t cublasv2handle; 2921 cublasStatus_t cerr; 2922 2923 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2924 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2925 B->cmap->n,B->rmap->n, 2926 &PETSC_CUSPARSE_ONE ,barray,blda, 2927 &PETSC_CUSPARSE_ZERO,barray,blda, 2928 mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2929 blda = B->cmap->n; 2930 k = B->cmap->n; 2931 } else { 2932 k = B->rmap->n; 2933 } 2934 2935 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2936 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2937 csrmat->num_entries,mat->alpha_one,mat->descr, 2938 csrmat->values->data().get(), 2939 csrmat->row_offsets->data().get(), 2940 csrmat->column_indices->data().get(), 2941 mmdata->Bt ? 
              mmdata->Bt : barray,blda,mat->beta_zero,
              carray,clda);PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  if (mmdata->cisdense) { /* C was originally MATSEQDENSE (CPU): copy the GPU result back into a CPU dense matrix */
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) { /* restore B to its original (CPU) dense type if we converted it on entry */
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for products C = op(A)*op(B) of a SeqAIJCUSPARSE matrix A with a dense matrix B
   (supports AB, AtB, ABt, PtAP and RARt). Sets the sizes and type of C and allocates the
   MatMatCusparse auxiliary data consumed by the numeric phase above. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  /* Numeric phase of the sparse-sparse product C = A*B, A^T*B or A*B^t with all operands
     SeqAIJCUSPARSE. The symbolic phase below must already have allocated C and the
     MatMatCusparse data (descriptors, buffers, flop count) that this routine consumes. */
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just run the assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* map symmetric AtB/ABt back to the plain AB kernel, mirroring what symbolic did */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized explicitly (matTranspose), since cuSPARSE spgemm cannot transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* CUDA >= 11.4: reuse the SpGEMMreuse plan built by symbolic; only values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: recompute with the buffers sized by symbolic, then copy into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif
#else
  /* CUDA < 11: legacy csrgemm interface */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of the sparse-sparse product: determines the nonzero structure of C,
   allocates the GPU CSR storage and the cuSPARSE SpGEMM descriptors/buffers, and mirrors
   the structure back to the host so C behaves as a regular SeqAIJ matrix. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: AtB with symmetric A (or ABt with symmetric B) is just AB; record that
     the numeric phase must make the same substitution */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operands (forming explicit transposes where needed) and result sizes m x n, inner dim k */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* numeric phase picks this wrapper instead of Bmat->mat */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C's descriptor starts with nnz 0 / NULL arrays; pointers are attached once nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    /* query-then-allocate-then-call pattern: first call sizes the buffers, second does the work */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    /* compute the values once here; the numeric phase may then only re-run reuse_compute */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR structure into the host Mat_SeqAIJ so C is a fully valid SeqAIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask
== PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  /* Dispatcher: decides, per product type and operand types/bindings, whether the product
     runs on the GPU (CUSPARSE/CUDA symbolic routines above), falls back to the CPU SeqAIJ
     implementation, or goes through the generic ABC path. User options such as
     -matmatmult_backend_cpu can force the CPU backend when all operands support the GPU. */
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized kernel: compose from the pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx: thin wrapper over the shared mult-add kernel (no add, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx: thin wrapper over the shared mult-add kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^H * xx; delegates to the shared kernel with trans=herm=TRUE */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H * xx + yy; delegates to the shared kernel with trans=herm=TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T * xx; delegates to the shared kernel with trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): scatter-add the compressed work vector back
   into the full-length result. 1D launch; out-of-range threads are guarded. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* x/y lengths for cusparseCreateDnVec(); set in the CSR branches below */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just beta*y (0 or a copy of yy) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Hermitian products, or matrices without an explicit transpose, use the
       original storage with a cuSPARSE transpose op; otherwise use the cached
       explicit transpose (formed on demand) with a non-transpose op. */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV cache below, hence the range check */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero for y=Ax+y; subtract one flop per nonzero row when there is no add */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy; delegates to the shared kernel with trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* CPU assembly, then drop the cached device matrix if the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (onnz != A->nonzerostate &&
      cusp->deviceMat) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

/* Free the GPU-side storage (plain or triangular-factor variant), detach the
   composed function pointers, then destroy the underlying SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate as SeqAIJ, then convert the copy in place to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y += a*X on the GPU. Uses cublasXaxpy when the nonzero patterns match,
   cusparse spgeam for SUBSET_NONZERO_PATTERN, otherwise falls back to the CPU
   SeqAIJ implementation (also when X is not using the same axpy, i.e. not on GPU). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical
     row offsets + column indices (compared on device) */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* spgeam alpha/beta are passed by host pointer here; mode is restored below */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays line up, so a dense-style axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* patterns differ: fall back to CPU implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y *= a, applied to the GPU values array via cublasXscal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t
cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero all stored values: on the GPU (mat and cached transpose, via thrust::fill)
   and on the host array, then set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values were zeroed too, so CPU and GPU agree */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch the matrix between CPU (flg=TRUE) and GPU (flg=FALSE) backends by
   swapping the ops tables and (re)composing the GPU-specific methods. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* ensure host data is current before handing control to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-side optimization; enable only when bound to CPU and present */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE (in place or into a new matrix),
   allocating the GPU-side struct and installing the CUSPARSE ops. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4258 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4259 spptr->format = MAT_CUSPARSE_CSR; 4260 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4261 #if CUSPARSE_VERSION > 11301 4262 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4263 #else 4264 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4265 #endif 4266 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4267 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4268 #endif 4269 B->spptr = spptr; 4270 } else { 4271 Mat_SeqAIJCUSPARSETriFactors *spptr; 4272 4273 PetscCall(PetscNew(&spptr)); 4274 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4275 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4276 B->spptr = spptr; 4277 } 4278 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4279 } 4280 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4281 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4282 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4283 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4284 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4285 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4286 4287 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 4288 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 4289 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4290 #if defined(PETSC_HAVE_HYPRE) 4291 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 4292 #endif 4293 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4294 PetscFunctionReturn(0); 4295 } 4296 4297 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4298 { 4299 PetscFunctionBegin; 4300 
PetscCall(MatCreate_SeqAIJ(B)); 4301 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 4302 PetscFunctionReturn(0); 4303 } 4304 4305 /*MC 4306 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4307 4308 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 4309 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 4310 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 4311 4312 Options Database Keys: 4313 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 4314 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4315 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
+  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE solver type for LU/Cholesky/ILU/ICC, and the banded variant for LU */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}

/* Free COO preallocation data (permutations and extended-COO device arrays).
   Freed device pointers are reset to NULL so that MatSeqAIJCUSPARSE_Destroy(),
   which frees them when non-NULL, cannot double-free them. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* avoid a second cudaFree() in MatSeqAIJCUSPARSE_Destroy() */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Free the whole Mat_SeqAIJCUSPARSE structure: mult structs, thrust arrays, handle, COO device data */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix and the thrust arrays it owns */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor struct: cusparse descriptors, CSR storage, solve/csr2csc buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)        PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Free a mat-mult struct: CSR or HYB/ELL payload, descriptors, device scalar constants, SpMV buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
if (*matstruct) { 4410 if ((*matstruct)->mat) { 4411 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 4412 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4413 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4414 #else 4415 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 4416 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4417 #endif 4418 } else { 4419 mat = (CsrMatrix*)(*matstruct)->mat; 4420 CsrMatrix_Destroy(&mat); 4421 } 4422 } 4423 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 4424 delete (*matstruct)->cprowIndices; 4425 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 4426 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 4427 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4428 4429 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4430 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 4431 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4432 for (int i=0; i<3; i++) { 4433 if (mdata->cuSpMV[i].initialized) { 4434 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 4435 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 4436 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4437 } 4438 } 4439 #endif 4440 delete *matstruct; 4441 *matstruct = NULL; 4442 } 4443 PetscFunctionReturn(0); 4444 } 4445 4446 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 4447 { 4448 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4449 4450 PetscFunctionBegin; 4451 if (fs) { 4452 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4453 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4454 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4455 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4456 delete fs->rpermIndices; 4457 delete fs->cpermIndices; 4458 delete fs->workVector; 4459 fs->rpermIndices = NULL; 4460 fs->cpermIndices = NULL; 4461 fs->workVector = NULL; 4462 if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 4463 if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 4464 fs->init_dev_prop = PETSC_FALSE; 4465 #if CUSPARSE_VERSION >= 11500 4466 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4467 PetscCallCUDA(cudaFree(fs->csrColIdx)); 4468 PetscCallCUDA(cudaFree(fs->csrVal)); 4469 PetscCallCUDA(cudaFree(fs->X)); 4470 PetscCallCUDA(cudaFree(fs->Y)); 4471 PetscCallCUDA(cudaFree(fs->factBuffer_M)); 4472 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4473 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4474 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4475 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4476 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4477 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4478 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4479 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4480 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4481 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4482 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4483 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4484 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4485 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4486 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4487 fs->builtSolveTranspose = PETSC_FALSE; 4488 #endif 4489 } 4490 PetscFunctionReturn(0); 4491 } 4492 4493 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 4494 { 4495 cusparseHandle_t handle; 4496 4497 PetscFunctionBegin; 4498 if (*trifactors) { 4499 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4500 
if (handle = (*trifactors)->handle) { 4501 PetscCallCUSPARSE(cusparseDestroy(handle)); 4502 } 4503 PetscCall(PetscFree(*trifactors)); 4504 } 4505 PetscFunctionReturn(0); 4506 } 4507 4508 struct IJCompare 4509 { 4510 __host__ __device__ 4511 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4512 { 4513 if (t1.get<0>() < t2.get<0>()) return true; 4514 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4515 return false; 4516 } 4517 }; 4518 4519 struct IJEqual 4520 { 4521 __host__ __device__ 4522 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4523 { 4524 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 4525 return true; 4526 } 4527 }; 4528 4529 struct IJDiff 4530 { 4531 __host__ __device__ 4532 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 4533 { 4534 return t1 == t2 ? 0 : 1; 4535 } 4536 }; 4537 4538 struct IJSum 4539 { 4540 __host__ __device__ 4541 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 4542 { 4543 return t1||t2; 4544 } 4545 }; 4546 4547 #include <thrust/iterator/discard_iterator.h> 4548 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 4549 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 4550 { 4551 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4552 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4553 THRUSTARRAY *cooPerm_v = NULL; 4554 thrust::device_ptr<const PetscScalar> d_v; 4555 CsrMatrix *matrix; 4556 PetscInt n; 4557 4558 PetscFunctionBegin; 4559 PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 4560 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 4561 if (!cusp->cooPerm) { 4562 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 4563 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 4564 PetscFunctionReturn(0); 
4565 } 4566 matrix = (CsrMatrix*)cusp->mat->mat; 4567 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4568 if (!v) { 4569 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 4570 goto finalize; 4571 } 4572 n = cusp->cooPerm->size(); 4573 if (isCudaMem(v)) { 4574 d_v = thrust::device_pointer_cast(v); 4575 } else { 4576 cooPerm_v = new THRUSTARRAY(n); 4577 cooPerm_v->assign(v,v+n); 4578 d_v = cooPerm_v->data(); 4579 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4580 } 4581 PetscCall(PetscLogGpuTimeBegin()); 4582 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4583 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4584 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 4585 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4586 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4587 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4588 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
4589 */ 4590 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4591 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 4592 delete cooPerm_w; 4593 } else { 4594 /* all nonzeros in d_v[] are unique entries */ 4595 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4596 matrix->values->begin())); 4597 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4598 matrix->values->end())); 4599 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 4600 } 4601 } else { 4602 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 4603 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4604 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4605 } else { 4606 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4607 matrix->values->begin())); 4608 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4609 matrix->values->end())); 4610 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4611 } 4612 } 4613 PetscCall(PetscLogGpuTimeEnd()); 4614 finalize: 4615 delete cooPerm_v; 4616 A->offloadmask = PETSC_OFFLOAD_GPU; 4617 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4618 /* shorter version of MatAssemblyEnd_SeqAIJ */ 4619 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 4620 
PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 4621 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 4622 a->reallocs = 0; 4623 A->info.mallocs += 0; 4624 A->info.nz_unneeded = 0; 4625 A->assembled = A->was_assembled = PETSC_TRUE; 4626 A->num_ass++; 4627 PetscFunctionReturn(0); 4628 } 4629 4630 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4631 { 4632 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4633 4634 PetscFunctionBegin; 4635 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4636 if (!cusp) PetscFunctionReturn(0); 4637 if (destroy) { 4638 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 4639 delete cusp->csr2csc_i; 4640 cusp->csr2csc_i = NULL; 4641 } 4642 A->transupdated = PETSC_FALSE; 4643 PetscFunctionReturn(0); 4644 } 4645 4646 #include <thrust/binary_search.h> 4647 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 4648 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 4649 { 4650 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4651 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4652 PetscInt cooPerm_n, nzr = 0; 4653 4654 PetscFunctionBegin; 4655 PetscCall(PetscLayoutSetUp(A->rmap)); 4656 PetscCall(PetscLayoutSetUp(A->cmap)); 4657 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 4658 if (n != cooPerm_n) { 4659 delete cusp->cooPerm; 4660 delete cusp->cooPerm_a; 4661 cusp->cooPerm = NULL; 4662 cusp->cooPerm_a = NULL; 4663 } 4664 if (n) { 4665 THRUSTINTARRAY d_i(n); 4666 THRUSTINTARRAY d_j(n); 4667 THRUSTINTARRAY ii(A->rmap->n); 4668 4669 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 4670 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 4671 4672 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 4673 d_i.assign(coo_i,coo_i+n); 4674 d_j.assign(coo_j,coo_j+n); 4675 4676 /* Ex. 4677 n = 6 4678 coo_i = [3,3,1,4,1,4] 4679 coo_j = [3,2,2,5,2,6] 4680 */ 4681 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4682 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4683 4684 PetscCall(PetscLogGpuTimeBegin()); 4685 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4686 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4687 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4688 THRUSTINTARRAY w = d_j; 4689 4690 /* 4691 d_i = [1,1,3,3,4,4] 4692 d_j = [2,2,2,3,5,6] 4693 cooPerm = [2,4,1,0,3,5] 4694 */ 4695 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4696 4697 /* 4698 d_i = [1,3,3,4,4,x] 4699 ^ekey 4700 d_j = [2,2,3,5,6,x] 4701 ^nekye 4702 */ 4703 if (nekey == ekey) { /* all entries are unique */ 4704 delete cusp->cooPerm_a; 4705 cusp->cooPerm_a = NULL; 4706 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4707 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4708 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4709 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4710 (*cusp->cooPerm_a)[0] = 0; /* clear the first 
entry, though accessing an entry on device implies a cudaMemcpy */ 4711 w[0] = 0; 4712 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4713 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4714 } 4715 thrust::counting_iterator<PetscInt> search_begin(0); 4716 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4717 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4718 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4719 PetscCall(PetscLogGpuTimeEnd()); 4720 4721 PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 4722 a->singlemalloc = PETSC_FALSE; 4723 a->free_a = PETSC_TRUE; 4724 a->free_ij = PETSC_TRUE; 4725 PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 4726 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4727 PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4728 a->nz = a->maxnz = a->i[A->rmap->n]; 4729 a->rmax = 0; 4730 PetscCall(PetscMalloc1(a->nz,&a->a)); 4731 PetscCall(PetscMalloc1(a->nz,&a->j)); 4732 PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4733 if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 4734 if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 4735 for (PetscInt i = 0; i < A->rmap->n; i++) { 4736 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4737 nzr += (PetscInt)!!(nnzr); 4738 a->ilen[i] = a->imax[i] = nnzr; 4739 a->rmax = PetscMax(a->rmax,nnzr); 4740 } 4741 a->nonzerorowcnt = nzr; 4742 A->preallocated = PETSC_TRUE; 4743 PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 
4744 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4745 } else { 4746 PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 4747 } 4748 PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 4749 4750 /* We want to allocate the CUSPARSE struct for matvec now. 4751 The code is so convoluted now that I prefer to copy zeros */ 4752 PetscCall(PetscArrayzero(a->a,a->nz)); 4753 PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 4754 A->offloadmask = PETSC_OFFLOAD_CPU; 4755 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4756 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 4757 PetscFunctionReturn(0); 4758 } 4759 4760 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 4761 { 4762 Mat_SeqAIJ *seq; 4763 Mat_SeqAIJCUSPARSE *dev; 4764 PetscBool coo_basic = PETSC_TRUE; 4765 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4766 4767 PetscFunctionBegin; 4768 PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 4769 PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4770 if (coo_i) { 4771 PetscCall(PetscGetMemType(coo_i,&mtype)); 4772 if (PetscMemTypeHost(mtype)) { 4773 for (PetscCount k=0; k<coo_n; k++) { 4774 if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4775 } 4776 } 4777 } 4778 4779 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4780 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4781 } else { 4782 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4783 mat->offloadmask = PETSC_OFFLOAD_CPU; 4784 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4785 seq = static_cast<Mat_SeqAIJ*>(mat->data); 4786 dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 4787 PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 4788 PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4789 
PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 4790 PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4791 dev->use_extended_coo = PETSC_TRUE; 4792 } 4793 PetscFunctionReturn(0); 4794 } 4795 4796 __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4797 { 4798 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4799 const PetscCount grid_size = gridDim.x * blockDim.x; 4800 for (; i<nnz; i+= grid_size) { 4801 PetscScalar sum = 0.0; 4802 for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4803 a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum; 4804 } 4805 } 4806 4807 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4808 { 4809 Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4810 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4811 PetscCount Annz = seq->nz; 4812 PetscMemType memtype; 4813 const PetscScalar *v1 = v; 4814 PetscScalar *Aa; 4815 4816 PetscFunctionBegin; 4817 if (dev->use_extended_coo) { 4818 PetscCall(PetscGetMemType(v,&memtype)); 4819 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4820 PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 4821 PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4822 } 4823 4824 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 4825 else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4826 4827 if (Annz) { 4828 MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 4829 PetscCallCUDA(cudaPeekAtLastError()); 4830 } 4831 4832 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 4833 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4834 4835 if (PetscMemTypeHost(memtype)) 
PetscCallCUDA(cudaFree((void*)v1)); 4836 } else { 4837 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4838 } 4839 PetscFunctionReturn(0); 4840 } 4841 4842 /*@C 4843 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 4844 4845 Not collective 4846 4847 Input Parameters: 4848 + A - the matrix 4849 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4850 4851 Output Parameters: 4852 + ia - the CSR row pointers 4853 - ja - the CSR column indices 4854 4855 Level: developer 4856 4857 Notes: 4858 When compressed is true, the CSR structure does not contain empty rows 4859 4860 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4861 @*/ 4862 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4863 { 4864 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4865 CsrMatrix *csr; 4866 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4867 4868 PetscFunctionBegin; 4869 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4870 if (!i || !j) PetscFunctionReturn(0); 4871 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4872 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4873 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4874 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4875 csr = (CsrMatrix*)cusp->mat->mat; 4876 if (i) { 4877 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4878 if (!cusp->rowoffsets_gpu) { 4879 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4880 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4881 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4882 } 4883 *i = cusp->rowoffsets_gpu->data().get(); 4884 } else *i = csr->row_offsets->data().get(); 4885 } 4886 if (j) *j = csr->column_indices->data().get(); 
4887 PetscFunctionReturn(0); 4888 } 4889 4890 /*@C 4891 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4892 4893 Not collective 4894 4895 Input Parameters: 4896 + A - the matrix 4897 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4898 4899 Output Parameters: 4900 + ia - the CSR row pointers 4901 - ja - the CSR column indices 4902 4903 Level: developer 4904 4905 .seealso: `MatSeqAIJCUSPARSEGetIJ()` 4906 @*/ 4907 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4908 { 4909 PetscFunctionBegin; 4910 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4911 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4912 if (i) *i = NULL; 4913 if (j) *j = NULL; 4914 PetscFunctionReturn(0); 4915 } 4916 4917 /*@C 4918 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4919 4920 Not Collective 4921 4922 Input Parameter: 4923 . A - a MATSEQAIJCUSPARSE matrix 4924 4925 Output Parameter: 4926 . 
a - pointer to the device data 4927 4928 Level: developer 4929 4930 Notes: may trigger host-device copies if up-to-date matrix data is on host 4931 4932 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 4933 @*/ 4934 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4935 { 4936 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4937 CsrMatrix *csr; 4938 4939 PetscFunctionBegin; 4940 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4941 PetscValidPointer(a,2); 4942 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4943 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4944 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4945 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4946 csr = (CsrMatrix*)cusp->mat->mat; 4947 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4948 *a = csr->values->data().get(); 4949 PetscFunctionReturn(0); 4950 } 4951 4952 /*@C 4953 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4954 4955 Not Collective 4956 4957 Input Parameter: 4958 . A - a MATSEQAIJCUSPARSE matrix 4959 4960 Output Parameter: 4961 . a - pointer to the device data 4962 4963 Level: developer 4964 4965 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 4966 @*/ 4967 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4968 { 4969 PetscFunctionBegin; 4970 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4971 PetscValidPointer(a,2); 4972 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4973 *a = NULL; 4974 PetscFunctionReturn(0); 4975 } 4976 4977 /*@C 4978 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4979 4980 Not Collective 4981 4982 Input Parameter: 4983 . 
A - a MATSEQAIJCUSPARSE matrix 4984 4985 Output Parameter: 4986 . a - pointer to the device data 4987 4988 Level: developer 4989 4990 Notes: may trigger host-device copies if up-to-date matrix data is on host 4991 4992 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 4993 @*/ 4994 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4995 { 4996 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4997 CsrMatrix *csr; 4998 4999 PetscFunctionBegin; 5000 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5001 PetscValidPointer(a,2); 5002 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5003 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5004 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5005 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5006 csr = (CsrMatrix*)cusp->mat->mat; 5007 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5008 *a = csr->values->data().get(); 5009 A->offloadmask = PETSC_OFFLOAD_GPU; 5010 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5011 PetscFunctionReturn(0); 5012 } 5013 /*@C 5014 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 5015 5016 Not Collective 5017 5018 Input Parameter: 5019 . A - a MATSEQAIJCUSPARSE matrix 5020 5021 Output Parameter: 5022 . 
a - pointer to the device data 5023 5024 Level: developer 5025 5026 .seealso: `MatSeqAIJCUSPARSEGetArray()` 5027 @*/ 5028 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 5029 { 5030 PetscFunctionBegin; 5031 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5032 PetscValidPointer(a,2); 5033 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5034 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 5035 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5036 *a = NULL; 5037 PetscFunctionReturn(0); 5038 } 5039 5040 /*@C 5041 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 5042 5043 Not Collective 5044 5045 Input Parameter: 5046 . A - a MATSEQAIJCUSPARSE matrix 5047 5048 Output Parameter: 5049 . a - pointer to the device data 5050 5051 Level: developer 5052 5053 Notes: does not trigger host-device copies and flags data validity on the GPU 5054 5055 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 5056 @*/ 5057 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 5058 { 5059 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5060 CsrMatrix *csr; 5061 5062 PetscFunctionBegin; 5063 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5064 PetscValidPointer(a,2); 5065 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5066 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5067 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5068 csr = (CsrMatrix*)cusp->mat->mat; 5069 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5070 *a = csr->values->data().get(); 5071 A->offloadmask = PETSC_OFFLOAD_GPU; 5072 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5073 PetscFunctionReturn(0); 5074 } 5075 5076 /*@C 5077 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access 
   array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (potentially) overwritten on the device: drop cached diagonal info */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so dependent objects know the matrix changed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* zero the caller's pointer so it cannot be (mis)used after the restore */
  PetscFunctionReturn(0);
}

/* Strict weak ordering on COO entries represented as (row, col, value, flag) tuples:
   lexicographic on (row, col); the value and flag components just tag along.
   Used below to merge the entry streams of two matrices with thrust::merge. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed offset to an (int) index; used below to shift
   column indices (e.g. B's columns by A->cmap->n) when concatenating matrices. */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */ 5127 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 5128 { 5129 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 5130 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 5131 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 5132 CsrMatrix *Acsr,*Bcsr,*Ccsr; 5133 PetscInt Annz,Bnnz; 5134 cusparseStatus_t stat; 5135 PetscInt i,m,n,zero = 0; 5136 5137 PetscFunctionBegin; 5138 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5139 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 5140 PetscValidPointer(C,4); 5141 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5142 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 5143 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 5144 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 5145 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5146 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5147 if (reuse == MAT_INITIAL_MATRIX) { 5148 m = A->rmap->n; 5149 n = A->cmap->n + B->cmap->n; 5150 PetscCall(MatCreate(PETSC_COMM_SELF,C)); 5151 PetscCall(MatSetSizes(*C,m,n,m,n)); 5152 PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 5153 c = (Mat_SeqAIJ*)(*C)->data; 5154 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 5155 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 5156 Ccsr = new CsrMatrix; 5157 Cmat->cprowIndices = NULL; 5158 c->compressedrow.use = PETSC_FALSE; 5159 c->compressedrow.nrows = 0; 5160 c->compressedrow.i = NULL; 5161 c->compressedrow.rindex = NULL; 5162 Ccusp->workVector = NULL; 5163 Ccusp->nrows = m; 5164 Ccusp->mat = Cmat; 5165 Ccusp->mat->mat = Ccsr; 5166 Ccsr->num_rows = m; 5167 Ccsr->num_cols = n; 5168 
PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 5169 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 5170 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 5171 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 5172 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 5173 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 5174 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5175 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5176 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5177 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5178 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 5179 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5180 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5181 5182 Acsr = (CsrMatrix*)Acusp->mat->mat; 5183 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5184 Annz = (PetscInt)Acsr->column_indices->size(); 5185 Bnnz = (PetscInt)Bcsr->column_indices->size(); 5186 c->nz = Annz + Bnnz; 5187 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 5188 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 5189 Ccsr->values = new THRUSTARRAY(c->nz); 5190 Ccsr->num_entries = c->nz; 5191 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 5192 if (c->nz) { 5193 auto Acoo = new THRUSTINTARRAY32(Annz); 5194 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 5195 auto Ccoo = new THRUSTINTARRAY32(c->nz); 5196 THRUSTINTARRAY32 *Aroff,*Broff; 5197 5198 if (a->compressedrow.use) { /* need full row offset */ 5199 if (!Acusp->rowoffsets_gpu) { 5200 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 5201 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 5202 
PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 5203 } 5204 Aroff = Acusp->rowoffsets_gpu; 5205 } else Aroff = Acsr->row_offsets; 5206 if (b->compressedrow.use) { /* need full row offset */ 5207 if (!Bcusp->rowoffsets_gpu) { 5208 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 5209 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 5210 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 5211 } 5212 Broff = Bcusp->rowoffsets_gpu; 5213 } else Broff = Bcsr->row_offsets; 5214 PetscCall(PetscLogGpuTimeBegin()); 5215 stat = cusparseXcsr2coo(Acusp->handle, 5216 Aroff->data().get(), 5217 Annz, 5218 m, 5219 Acoo->data().get(), 5220 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5221 stat = cusparseXcsr2coo(Bcusp->handle, 5222 Broff->data().get(), 5223 Bnnz, 5224 m, 5225 Bcoo->data().get(), 5226 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5227 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 5228 auto Aperm = thrust::make_constant_iterator(1); 5229 auto Bperm = thrust::make_constant_iterator(0); 5230 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 5231 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 5232 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 5233 #else 5234 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 5235 auto Bcib = Bcsr->column_indices->begin(); 5236 auto Bcie = Bcsr->column_indices->end(); 5237 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 5238 #endif 5239 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 5240 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 5241 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 5242 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 5243 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 5244 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 5245 auto p1 = Ccusp->cooPerm->begin(); 5246 auto p2 = Ccusp->cooPerm->begin(); 5247 thrust::advance(p2,Annz); 5248 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 5249 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 5250 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 5251 #endif 5252 auto cci = thrust::make_counting_iterator(zero); 5253 auto cce = thrust::make_counting_iterator(c->nz); 5254 #if 0 //Errors on SUMMIT cuda 11.1.0 5255 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 5256 #else 5257 auto pred = thrust::identity<int>(); 5258 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 5259 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 5260 #endif 5261 stat = cusparseXcoo2csr(Ccusp->handle, 5262 Ccoo->data().get(), 5263 c->nz, 5264 m, 5265 Ccsr->row_offsets->data().get(), 5266 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5267 PetscCall(PetscLogGpuTimeEnd()); 5268 delete wPerm; 5269 delete Acoo; 5270 delete Bcoo; 5271 delete Ccoo; 5272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5273 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 5274 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 5275 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 5276 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5277 #endif 5278 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 5279 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 5280 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 5281 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5282 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 5283 CsrMatrix *CcsrT = new CsrMatrix; 5284 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 5285 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5286 5287 (*C)->form_explicit_transpose = PETSC_TRUE; 5288 (*C)->transupdated = PETSC_TRUE; 5289 Ccusp->rowoffsets_gpu = NULL; 5290 CmatT->cprowIndices = NULL; 5291 CmatT->mat = CcsrT; 5292 CcsrT->num_rows = n; 5293 CcsrT->num_cols = m; 5294 CcsrT->num_entries = c->nz; 5295 5296 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 5297 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 5298 CcsrT->values = new THRUSTARRAY(c->nz); 5299 5300 PetscCall(PetscLogGpuTimeBegin()); 5301 auto rT = CcsrT->row_offsets->begin(); 5302 if (AT) { 5303 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 5304 thrust::advance(rT,-1); 5305 } 5306 if (BT) { 5307 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 5308 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 5309 thrust::copy(titb,tite,rT); 5310 } 5311 auto cT = CcsrT->column_indices->begin(); 5312 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 5313 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 5314 auto vT = CcsrT->values->begin(); 5315 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5316 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 5317 PetscCall(PetscLogGpuTimeEnd()); 5318 5319 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 5320 PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, 
CUSPARSE_INDEX_BASE_ZERO)); 5321 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 5322 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 5323 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 5324 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 5325 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5326 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5327 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5328 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5329 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 5330 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 5331 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 5332 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5333 #endif 5334 Ccusp->matTranspose = CmatT; 5335 } 5336 } 5337 5338 c->singlemalloc = PETSC_FALSE; 5339 c->free_a = PETSC_TRUE; 5340 c->free_ij = PETSC_TRUE; 5341 PetscCall(PetscMalloc1(m+1,&c->i)); 5342 PetscCall(PetscMalloc1(c->nz,&c->j)); 5343 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 5344 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 5345 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 5346 ii = *Ccsr->row_offsets; 5347 jj = *Ccsr->column_indices; 5348 PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5349 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5350 } else { 5351 PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5352 
PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5353 } 5354 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 5355 PetscCall(PetscMalloc1(m,&c->ilen)); 5356 PetscCall(PetscMalloc1(m,&c->imax)); 5357 c->maxnz = c->nz; 5358 c->nonzerorowcnt = 0; 5359 c->rmax = 0; 5360 for (i = 0; i < m; i++) { 5361 const PetscInt nn = c->i[i+1] - c->i[i]; 5362 c->ilen[i] = c->imax[i] = nn; 5363 c->nonzerorowcnt += (PetscInt)!!nn; 5364 c->rmax = PetscMax(c->rmax,nn); 5365 } 5366 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 5367 PetscCall(PetscMalloc1(c->nz,&c->a)); 5368 (*C)->nonzerostate++; 5369 PetscCall(PetscLayoutSetUp((*C)->rmap)); 5370 PetscCall(PetscLayoutSetUp((*C)->cmap)); 5371 Ccusp->nonzerostate = (*C)->nonzerostate; 5372 (*C)->preallocated = PETSC_TRUE; 5373 } else { 5374 PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 5375 c = (Mat_SeqAIJ*)(*C)->data; 5376 if (c->nz) { 5377 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 5378 PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 5379 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5380 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 5381 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5382 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 5383 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5384 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5385 Acsr = (CsrMatrix*)Acusp->mat->mat; 5386 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5387 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 5388 PetscCheck(Acsr->num_entries == 
(PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 5389 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 5390 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 5391 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 5392 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 5393 auto pmid = Ccusp->cooPerm->begin(); 5394 thrust::advance(pmid,Acsr->num_entries); 5395 PetscCall(PetscLogGpuTimeBegin()); 5396 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 5397 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 5398 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 5399 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5400 thrust::for_each(zibait,zieait,VecCUDAEquals()); 5401 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 5402 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5403 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 5404 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 5405 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 5406 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 5407 if (A->form_explicit_transpose && 
B->form_explicit_transpose && (*C)->form_explicit_transpose) { 5408 PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 5409 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5410 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 5411 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5412 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 5413 auto vT = CcsrT->values->begin(); 5414 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5415 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 5416 (*C)->transupdated = PETSC_TRUE; 5417 } 5418 PetscCall(PetscLogGpuTimeEnd()); 5419 } 5420 } 5421 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 5422 (*C)->assembled = PETSC_TRUE; 5423 (*C)->was_assembled = PETSC_FALSE; 5424 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 5425 PetscFunctionReturn(0); 5426 } 5427 5428 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 5429 { 5430 bool dmem; 5431 const PetscScalar *av; 5432 5433 PetscFunctionBegin; 5434 dmem = isCudaMem(v); 5435 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 5436 if (n && idx) { 5437 THRUSTINTARRAY widx(n); 5438 widx.assign(idx,idx+n); 5439 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 5440 5441 THRUSTARRAY *w = NULL; 5442 thrust::device_ptr<PetscScalar> dv; 5443 if (dmem) { 5444 dv = thrust::device_pointer_cast(v); 5445 } else { 5446 w = new THRUSTARRAY(n); 5447 dv = w->data(); 5448 } 5449 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 5450 5451 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 5452 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 5453 
thrust::for_each(zibit,zieit,VecCUDAEquals()); 5454 if (w) { 5455 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 5456 } 5457 delete w; 5458 } else { 5459 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 5460 } 5461 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 5462 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 5463 PetscFunctionReturn(0); 5464 } 5465