/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Strings parsed by PetscOptionsEnum() for -mat_cusparse[_mult]_storage_format */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
  0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* NOTE(review): the next two declarations are C++ overloads sharing one name */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   storage format in the matrix's Mat_SeqAIJCUSPARSE structure. For a sequential
   matrix only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are accepted, and both set
   the same (single) format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
   The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if the matrix type provides one */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): stores the
   CPU-solve flag in the matrix's Mat_SeqAIJCUSPARSE structure. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).
   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if the matrix type provides one */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}

/* Handles the options that need CUSPARSE-specific treatment (currently only
   MAT_FORM_EXPLICIT_TRANSPOSE); all other options are forwarded to the SeqAIJ
   implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization of A into B. The factorization itself is done on the
   CPU by the SeqAIJ kernel; afterwards the solve function pointers are selected
   (GPU cuSPARSE solves by default, CPU solves when use_cpu_solve is set) and the
   triangular factors are copied to the GPU if needed. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));     /* make sure the host copy of A is current */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); /* factor on the CPU */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
  */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    /* the natural-ordering variants skip the row/column permutation work */
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* runtime options (storage format, CPU solve flag,
   and on CUDA >= 11 the SpMV/SpMM/csr2csc algorithm choices) for an unfactored
   SeqAIJCUSPARSE matrix. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg)
      PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    /* same positional-consistency check as above, for the SpMM enum */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Builds the unit-diagonal lower-triangular ILU factor L on the GPU from the
   host (SeqAIJ) factorization, and runs the cuSPARSE triangular-solve analysis.
   On repeat calls with an existing factor, only the numerical values are refreshed. */
static PetscErrorCode
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host staging buffers speed up the host->device copies below */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: copy each strictly-lower row of the
           factored storage and append a unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]   = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the work buffer required by the csrsv analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  loTriFactor->solveInfo,
                                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the upper-triangular ILU factor U on the GPU from the host (SeqAIJ)
   factorization, and runs the cuSPARSE triangular-solve analysis. On repeat
   calls with an existing factor, only the numerical values are refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt   n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers speed up the host->device copies below */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking the rows backwards as they
           are addressed through adiag[] */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements; note the reciprocal of the stored
             diagonal value is what goes to the GPU */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the work buffer required by the csrsv analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  upTriFactor->solveInfo,
                                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copies both ILU triangular factors to the GPU (building them on first use),
   allocates the solve work vector, and caches the row/column permutation index
   arrays on the device when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* build (or refresh the values of) both triangular factors on the GPU */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Builds both triangular factors for ICC on the GPU from the host factorization,
   which is accessed through the SeqSBAIJ layout. The same upper-triangular index
   structure backs both factors; the "lower" solve is performed as a transposed
   solve on the upper storage (see the solveOp/fill-mode settings below). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const
PetscInt *ai = b->i,*aj = b->j,*vj; 607 const MatScalar *aa = b->a,*v; 608 609 PetscFunctionBegin; 610 if (!n) PetscFunctionReturn(0); 611 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 612 try { 613 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 614 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 615 if (!upTriFactor && !loTriFactor) { 616 /* Allocate Space for the upper triangular matrix */ 617 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 618 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 619 620 /* Fill the upper triangular matrix */ 621 AiUp[0]=(PetscInt) 0; 622 AiUp[n]=nzUpper; 623 offset = 0; 624 for (i=0; i<n; i++) { 625 /* set the pointers */ 626 v = aa + ai[i]; 627 vj = aj + ai[i]; 628 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 629 630 /* first, set the diagonal elements */ 631 AjUp[offset] = (PetscInt) i; 632 AAUp[offset] = (MatScalar)1.0/v[nz]; 633 AiUp[i] = offset; 634 AALo[offset] = (MatScalar)1.0/v[nz]; 635 636 offset+=1; 637 if (nz>0) { 638 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 639 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 640 for (j=offset; j<offset+nz; j++) { 641 AAUp[j] = -AAUp[j]; 642 AALo[j] = AAUp[j]/v[nz]; 643 } 644 offset+=nz; 645 } 646 } 647 648 /* allocate space for the triangular factor information */ 649 PetscCall(PetscNew(&upTriFactor)); 650 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 651 652 /* Create the matrix description */ 653 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 654 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 655 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 656 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 657 #else 658 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 659 #endif 660 
PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 661 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 662 663 /* set the matrix */ 664 upTriFactor->csrMat = new CsrMatrix; 665 upTriFactor->csrMat->num_rows = A->rmap->n; 666 upTriFactor->csrMat->num_cols = A->cmap->n; 667 upTriFactor->csrMat->num_entries = a->nz; 668 669 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 670 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 671 672 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 673 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 674 675 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 676 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 677 678 /* set the operation */ 679 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 680 681 /* Create the solve analysis information */ 682 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 683 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 684 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 685 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 686 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 687 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 688 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 689 &upTriFactor->solveBufferSize)); 690 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 691 #endif 692 693 /* perform the solve analysis */ 694 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 695 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 696 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 697 
upTriFactor->csrMat->column_indices->data().get(), 698 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 699 upTriFactor->solveInfo, 700 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 701 #else 702 upTriFactor->solveInfo)); 703 #endif 704 PetscCallCUDA(WaitForCUDA()); 705 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 706 707 /* assign the pointer */ 708 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 709 710 /* allocate space for the triangular factor information */ 711 PetscCall(PetscNew(&loTriFactor)); 712 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 713 714 /* Create the matrix description */ 715 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 716 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 717 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 718 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 719 #else 720 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 721 #endif 722 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 723 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 724 725 /* set the operation */ 726 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 727 728 /* set the matrix */ 729 loTriFactor->csrMat = new CsrMatrix; 730 loTriFactor->csrMat->num_rows = A->rmap->n; 731 loTriFactor->csrMat->num_cols = A->cmap->n; 732 loTriFactor->csrMat->num_entries = a->nz; 733 734 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 735 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 736 737 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 738 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 739 740 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 741 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 742 743 /* Create the solve 
analysis information */ 744 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 745 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 746 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 747 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 748 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 749 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 750 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 751 &loTriFactor->solveBufferSize)); 752 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 753 #endif 754 755 /* perform the solve analysis */ 756 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 757 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 758 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 759 loTriFactor->csrMat->column_indices->data().get(), 760 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 761 loTriFactor->solveInfo, 762 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 763 #else 764 loTriFactor->solveInfo)); 765 #endif 766 PetscCallCUDA(WaitForCUDA()); 767 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 768 769 /* assign the pointer */ 770 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 771 772 PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 773 PetscCallCUDA(cudaFreeHost(AiUp)); 774 PetscCallCUDA(cudaFreeHost(AjUp)); 775 } else { 776 /* Fill the upper triangular matrix */ 777 offset = 0; 778 for (i=0; i<n; i++) { 779 /* set the pointers */ 780 v = aa + ai[i]; 781 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 782 783 /* first, set the diagonal elements */ 784 AAUp[offset] = 1.0/v[nz]; 785 AALo[offset] = 1.0/v[nz]; 786 787 offset+=1; 788 if (nz>0) { 789 
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];      /* negate the strictly upper-triangular entries for U */
              AALo[j] = AAUp[j]/v[nz]; /* scale by the diagonal entry to form the L values */
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        /* factor structures already live on the GPU: only the numerical values need refreshing */
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
  Builds the GPU triangular factors for an ICC factorization (via
  MatSeqAIJCUSPARSEBuildICCTriMatrices), allocates the work vector used by the
  triangular solves, records the factored nonzero count, and caches the
  row/column permutations on the GPU when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  /* scratch vector shared by the two triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of L + U with the shared diagonal counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    /* cache both the permutation and its inverse on the GPU for MatSolve */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/*
  Numeric Cholesky factorization: runs the CPU kernel, marks the factor as
  host-resident, selects the solve kernels according to whether the ordering
  is the identity, then pushes the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         ip = b->row;
  PetscBool  perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/*
  Builds transposed copies (CSC form) of the lower and upper triangular
  factors and runs the csrsv solve analysis on them, so that MatSolveTranspose
  can use NON_TRANSPOSE solves on explicitly transposed data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the
transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the transpose flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the data is explicitly transposed, so solve without op */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                                  loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
                                                  loTriFactorT->csrMat->values->data().get(),
                                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                     loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                     loTriFactor->csrMat->values->data().get(),
                                     loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(),
                                     loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this previously called PetscLogEventBegin() a second time, leaving the
     MAT_CUSPARSEGenerateTranspose event unbalanced and corrupting -log_view timings */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                            &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                            loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactorT->solveInfo,
                                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; the transpose flips the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                                  upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
                                                  upTriFactorT->csrMat->values->data().get(),
                                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                     upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                     upTriFactor->csrMat->values->data().get(),
                                     upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(),
                                     upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  /* bug fix: was PetscLogEventBegin(), unbalancing the event (see the lower-factor path above) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                            &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                            upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactorT->solveInfo,
                                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: reinterpret the real part of a PetscScalar as a PetscInt index
   (used to turn the csr2csc-permuted sequence values back into a permutation). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/*
  Forms (or numerically refreshes) the explicit transpose of the AIJ matrix on
  the GPU so that MatMultTranspose can run as a non-transposed SpMV.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  /* nothing to do if the transpose is already up to date */
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident scalars used by the SpMV calls) */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      /* transposed dimensions: rows/cols swapped */
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
#else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }
#endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* build the CSR->CSC value permutation once by transposing a 0,1,2,... sequence */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the CSR values through the cached permutation to refresh the transpose */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */ 1335 if (!loTriFactorT && !upTriFactorT) { 1336 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1337 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1338 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1339 } 1340 1341 /* Get the GPU pointers */ 1342 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1343 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1344 xGPU = thrust::device_pointer_cast(xarray); 1345 bGPU = thrust::device_pointer_cast(barray); 1346 1347 PetscCall(PetscLogGpuTimeBegin()); 1348 /* First, reorder with the row permutation */ 1349 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1350 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1351 xGPU); 1352 1353 /* First, solve U */ 1354 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355 upTriFactorT->csrMat->num_rows, 1356 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1357 upTriFactorT->csrMat->num_entries, 1358 #endif 1359 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360 upTriFactorT->csrMat->values->data().get(), 1361 upTriFactorT->csrMat->row_offsets->data().get(), 1362 upTriFactorT->csrMat->column_indices->data().get(), 1363 upTriFactorT->solveInfo, 1364 xarray, 1365 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1366 tempGPU->data().get(), 1367 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1368 #else 1369 tempGPU->data().get());PetscCallCUSPARSE(stat); 1370 #endif 1371 1372 /* Then, solve L */ 1373 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1374 loTriFactorT->csrMat->num_rows, 1375 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1376 loTriFactorT->csrMat->num_entries, 1377 #endif 1378 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1379 loTriFactorT->csrMat->values->data().get(), 1380 
loTriFactorT->csrMat->row_offsets->data().get(), 1381 loTriFactorT->csrMat->column_indices->data().get(), 1382 loTriFactorT->solveInfo, 1383 tempGPU->data().get(), 1384 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1385 xarray, 1386 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1387 #else 1388 xarray);PetscCallCUSPARSE(stat); 1389 #endif 1390 1391 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1392 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1393 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1394 tempGPU->begin()); 1395 1396 /* Copy the temporary to the full solution. */ 1397 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1398 1399 /* restore */ 1400 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1401 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1402 PetscCall(PetscLogGpuTimeEnd()); 1403 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1404 PetscFunctionReturn(0); 1405 } 1406 1407 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1408 { 1409 const PetscScalar *barray; 1410 PetscScalar *xarray; 1411 cusparseStatus_t stat; 1412 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1413 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1414 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1415 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1416 1417 PetscFunctionBegin; 1418 /* Analyze the matrix and create the transpose ... 
on the fly */
  /* Lazily build the transposed triangular factors on first use (cached afterwards) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                              upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                              upTriFactorT->csrMat->values->data().get(),
                              upTriFactorT->csrMat->row_offsets->data().get(),
                              upTriFactorT->csrMat->column_indices->data().get(),
                              upTriFactorT->solveInfo,
                              barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              tempGPU->data().get(),
                              upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                              loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                              loTriFactorT->csrMat->values->data().get(),
                              loTriFactorT->csrMat->row_offsets->data().get(),
                              loTriFactorT->csrMat->column_indices->data().get(),
                              loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b using the cached lower/upper triangular factors: the row permutation is
   applied to b first, then L y = b' and U z = y are solved, and finally the column
   permutation is applied to produce x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  /* NOTE(review): unlike the transpose-solve variant above, the end iterator here reuses bGPU
     without +n; the permutation range is delimited by the rpermIndices index iterators, so this
     appears equivalent -- confirm against the thrust permutation_iterator semantics */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                              loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                              loTriFactor->csrMat->values->data().get(),
                              loTriFactor->csrMat->row_offsets->data().get(),
                              loTriFactor->csrMat->column_indices->data().get(),
                              loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                              upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                              upTriFactor->csrMat->values->data().get(),
                              upTriFactor->csrMat->row_offsets->data().get(),
                              upTriFactor->csrMat->column_indices->data().get(),
                              upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              tempGPU->data().get(),
                              upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* forward + backward substitution: one multiply-add per stored nonzero, minus the
     n divisions already counted in the factorization */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b with natural (identity) ordering: no permutations; L y = b, then U x = y */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                              loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                              loTriFactor->csrMat->values->data().get(),
                              loTriFactor->csrMat->row_offsets->data().get(),
                              loTriFactor->csrMat->column_indices->data().get(),
                              loTriFactor->solveInfo,
                              barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              tempGPU->data().get(),
                              loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                              upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                              upTriFactor->csrMat->values->data().get(),
                              upTriFactor->csrMat->row_offsets->data().get(),
                              upTriFactor->csrMat->column_indices->data().get(),
                              upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */

/* Solve A x = b using the in-place ILU(0) factors held in fact->spptr via the generic
   SpSV API: L y = b (unit lower triangle), then U x = y (non-unit upper triangle).
   The dense-vector descriptors are re-pointed at the caller's arrays before each solve. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X,
                                       fs->dnVecDescr_Y,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L));

  /* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y,
                                       fs->dnVecDescr_X,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Transpose solve A^T x = b with ILU(0) factors: U^T y = b then L^T x = y. The transpose
   SpSV descriptors/buffers are created and analyzed lazily on the first call and cached
   (guarded by fs->builtSolveTranspose). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij =
(Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->builtSolveTranspose) { /* Call MatSolveTranspose() for the first time */
    /* One-time setup: create/size/analyze the transpose-solve descriptors for L^T and U^T */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                              CUSPARSE_OPERATION_TRANSPOSE,
                                              &PETSC_CUSPARSE_ONE,
                                              fs->spMatDescr_L, /* The matrix is still L. We only do tranpose solve with it */
                                              fs->dnVecDescr_X,
                                              fs->dnVecDescr_Y,
                                              cusparse_scalartype,
                                              CUSPARSE_SPSV_ALG_DEFAULT,
                                              fs->spsvDescr_Lt,
                                              &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                              CUSPARSE_OPERATION_TRANSPOSE,
                                              &PETSC_CUSPARSE_ONE,
                                              fs->spMatDescr_U,
                                              fs->dnVecDescr_X,
                                              fs->dnVecDescr_Y,
                                              cusparse_scalartype,
                                              CUSPARSE_SPSV_ALG_DEFAULT,
                                              fs->spsvDescr_Ut,
                                              &fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                            CUSPARSE_OPERATION_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Lt,
                                            fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                            CUSPARSE_OPERATION_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_U,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Ut,
                                            fs->spsvBuffer_Ut));
    fs->builtSolveTranspose = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X,
                                       fs->dnVecDescr_Y,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,
                                       fs->dnVecDescr_X,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric ILU(0) factorization: copies A's values into fact's preallocated device CSR
   arrays, factors in place with csrilu02(), then (re)runs the numeric SpSV analyses for
   L and U and installs the ILU0 solve callbacks. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                             fs->matDescr_M,
                                             fs->csrVal,
                                             fs->csrRowPtr,
                                             fs->csrColIdx,
                                             fs->ilu0Info_M,
                                             fs->policy_M,
                                             fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* From my experiment, cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_L,
                                          fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_U,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_U,
                                          fs->spsvBuffer_U));

  /* Factors now live only on the GPU; install the SpSV-based solve callbacks */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ILU(0) factorization: since ILU(0) keeps A's sparsity pattern, this copies A's
   CSR structure to the device, creates the cusparse descriptors for M (the combined factor),
   L (unit lower) and U (non-unit upper), sizes and allocates all work buffers, runs the
   structural analyses, and precomputes a FLOP estimate for the numeric phase.
   isrow/iscol are unused (natural ordering on the GPU). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.
   */
  /* We'll do in-place factorization on fact */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */
  /* device-to-device copies on the PETSc default stream; no host sync needed here */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  /* L and U are views of the same in-place CSR arrays; fill mode/diag type attributes tell
     SpSV which triangle (and whether the diagonal) to use */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                                      fs->csrRowPtr,
                                      fs->csrColIdx,
                                      fs->csrVal,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO,
                                      cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_FILL_MODE,
                                              &fillMode,
                                              sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_DIAG_TYPE,
                                              &diagType,
                                              sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
                                      fs->csrRowPtr,
                                      fs->csrColIdx,
                                      fs->csrVal,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO,
                                      cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                                              CUSPARSE_SPMAT_FILL_MODE,
                                              &fillMode,
                                              sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                                              CUSPARSE_SPMAT_DIAG_TYPE,
                                              &diagType,
                                              sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                        fs->matDescr_M,
                                                        fs->csrVal,
                                                        fs->csrRowPtr,
                                                        fs->csrColIdx,
                                                        fs->ilu0Info_M,
                                                        &fs->factBufferSize_M));

  /* X/Y back the dense-vector descriptors used by the SpSV analyses/solves */
  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_L,
                                            &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_U,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_U,
                                            &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     spsvBuffer_L and spsvBuffer_U can not be shared.
   */
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M));

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                      fs->matDescr_M,
                                                      fs->csrVal,
                                                      fs->csrRowPtr,
                                                      fs->csrColIdx,
                                                      fs->ilu0Info_M,
                                                      fs->policy_M,
                                                      fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
  PetscInt       *Adiag,nzRow,nzLeft;
  PetscLogDouble flops = 0.0;

  PetscCall(MatMarkDiagonal_SeqAIJ(A));
  Ai    = Aseq->i;
  Adiag = Aseq->diag;
  for (PetscInt i=0; i<m; i++) {
    if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
      nzRow  = Ai[i+1] - Ai[i];
      nzLeft = Adiag[i] - Ai[i];
      /* We want to eliminate nonzeros left to the diagonal one by one.
         Assume each time, nonzeros right
         and include the eliminated one will be updated, which incurs a multiplication and an addition.
      */
      /* NOTE(review): the next line overwrites the exact Adiag-based nzLeft computed above with an
         average-based estimate, making the first assignment a dead store -- confirm which count is
         intended (this only affects the logged FLOP estimate, not the factorization itself) */
      nzLeft = (nzRow-1)/2;
      flops += nzLeft*(2.0*nzRow-nzLeft+1);
    }
  }
  fs->numericFactFlops = flops;
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Solve A x = b using the in-place IC(0) factor held in fact->spptr: L y = b, then
   L^T x = y, via the generic SpSV API with the descriptors built at symbolic time. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X,
                                       fs->dnVecDescr_Y,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                                       CUSPARSE_OPERATION_TRANSPOSE,
                                       &PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,
                                       fs->dnVecDescr_X,
                                       cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric IC(0) factorization: copies A's values to fact's device CSR arrays, factors in
   place with csric02(), then runs the numeric SpSV analyses for L and L^T and installs
   the IC(0) solve callbacks. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat
fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, /* m=0 is skipped, matching the csrilu02 path */
                                            fs->matDescr_M,
                                            fs->csrVal,
                                            fs->csrRowPtr,
                                            fs->csrColIdx,
                                            fs->ic0Info_M,
                                            fs->policy_M,
                                            fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_L,
                                          fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                                          CUSPARSE_OPERATION_TRANSPOSE,
                                          &PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,
                                          fs->dnVecDescr_X,
                                          fs->dnVecDescr_Y,
                                          cusparse_scalartype,
                                          CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_Lt,
                                          fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  /* the IC(0) factorization is symmetric (A ~ L*Lt), so the same L/Lt solve handles A^T x = b */
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic IC(0) factorization: IC(0) keeps A's (lower-triangular) sparsity pattern, so
   this copies A's CSR structure to the device, creates descriptors for M and L (non-unit
   lower triangle), sizes and allocates the csric02 and SpSV (L and L^T) buffers, runs the
   structural analysis, and precomputes a FLOP estimate. perm is unused. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij =
(Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.
   */
  /* We'll do in-place factorization on fact */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  /* Unlike ILU(0), the IC(0) factor L carries its diagonal explicitly (non-unit) */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                                      fs->csrRowPtr,
                                      fs->csrColIdx,
                                      fs->csrVal,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO,
                                      cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_FILL_MODE,
                                              &fillMode,
                                              sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                                              CUSPARSE_SPMAT_DIAG_TYPE,
                                              &diagType,
                                              sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz,
                                                       fs->matDescr_M,
                                                       fs->csrVal,
                                                       fs->csrRowPtr,
                                                       fs->csrColIdx,
                                                       fs->ic0Info_M,
                                                       &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_L,
                                            &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                                            CUSPARSE_OPERATION_TRANSPOSE,
                                            &PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,
                                            fs->dnVecDescr_X,
                                            fs->dnVecDescr_Y,
                                            cusparse_scalartype,
                                            CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Lt,
                                            &fs->spsvBufferSize_Lt));

  PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M));
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));

  /* ========================================================================== */
  /* Perform analysis of ic0 on M */
  /* The lower triangular part of M has the same sparsity pattern as L */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz,
                                                     fs->matDescr_M,
                                                     fs->csrVal,
                                                     fs->csrRowPtr,
                                                     fs->csrColIdx,
                                                     fs->ic0Info_M,
                                                     fs->policy_M,
                                                     fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
  PetscInt       nzRow,nzLeft;
  PetscLogDouble flops = 0.0;

  Ai = Aseq->i;
  for (PetscInt i=0; i<m; i++) {
    nzRow = Ai[i+1] - Ai[i];
    if (nzRow > 1) {
      /* We want to eliminate nonzeros left to the diagonal one by one.
Assume each time, nonzeros right 2312 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2313 */ 2314 nzLeft = (nzRow-1)/2; 2315 flops += nzLeft*(2.0*nzRow-nzLeft+1); 2316 } 2317 } 2318 fs->numericFactFlops = flops; 2319 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2320 PetscFunctionReturn(0); 2321 } 2322 #endif 2323 2324 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2325 { 2326 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2327 2328 PetscFunctionBegin; 2329 #if CUSPARSE_VERSION >= 11500 2330 PetscBool row_identity,col_identity; 2331 PetscCall(ISIdentity(isrow,&row_identity)); 2332 PetscCall(ISIdentity(iscol,&col_identity)); 2333 if (!info->levels && row_identity && col_identity) { 2334 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info)); 2335 } else 2336 #endif 2337 { 2338 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2339 PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2340 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2341 } 2342 PetscFunctionReturn(0); 2343 } 2344 2345 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2346 { 2347 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2348 2349 PetscFunctionBegin; 2350 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2351 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2352 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2353 PetscFunctionReturn(0); 2354 } 2355 2356 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2357 { 2358 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2359 2360 PetscFunctionBegin; 2361 #if 
CUSPARSE_VERSION >= 11500
  PetscBool perm_identity;
  PetscCall(ISIdentity(perm,&perm_identity));
  if (!info->levels && perm_identity) {
    /* ICC(0) with natural ordering: use the cuSPARSE csric02-based path */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
  } else
#endif
  {
    /* General ICC(k)/reordered case: symbolic phase on host, numeric phase on device */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: always host symbolic phase, device numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports the solver package name. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factory for MATSOLVERCUSPARSE factor matrices: creates an empty MATSEQAIJCUSPARSE
   of the right size and installs the symbolic-factorization function pointers
   appropriate for the requested factor type (and for whether A is bound to the CPU). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A lives on the CPU: use the plain SeqAIJ factorizations */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2440 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 2441 2442 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 2443 (*B)->canuseordering = PETSC_TRUE; 2444 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 2445 PetscFunctionReturn(0); 2446 } 2447 2448 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2449 { 2450 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2451 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2452 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 2453 2454 PetscFunctionBegin; 2455 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 2456 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2457 if (A->factortype == MAT_FACTOR_NONE) { 2458 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 2459 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2460 } 2461 #if CUSPARSE_VERSION >= 13500 2462 else if (fs->csrVal) { 2463 /* We have a factorized matrix on device and are able to copy it to host */ 2464 PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2465 } 2466 #endif 2467 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host"); 2468 PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 2469 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2470 A->offloadmask = PETSC_OFFLOAD_BOTH; 2471 } 2472 PetscFunctionReturn(0); 2473 } 2474 2475 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2476 { 2477 PetscFunctionBegin; 2478 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2479 *array = ((Mat_SeqAIJ*)A->data)->a; 2480 PetscFunctionReturn(0); 2481 } 2482 2483 static 
PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2484 { 2485 PetscFunctionBegin; 2486 A->offloadmask = PETSC_OFFLOAD_CPU; 2487 *array = NULL; 2488 PetscFunctionReturn(0); 2489 } 2490 2491 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 2492 { 2493 PetscFunctionBegin; 2494 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2495 *array = ((Mat_SeqAIJ*)A->data)->a; 2496 PetscFunctionReturn(0); 2497 } 2498 2499 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 2500 { 2501 PetscFunctionBegin; 2502 *array = NULL; 2503 PetscFunctionReturn(0); 2504 } 2505 2506 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2507 { 2508 PetscFunctionBegin; 2509 *array = ((Mat_SeqAIJ*)A->data)->a; 2510 PetscFunctionReturn(0); 2511 } 2512 2513 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2514 { 2515 PetscFunctionBegin; 2516 A->offloadmask = PETSC_OFFLOAD_CPU; 2517 *array = NULL; 2518 PetscFunctionReturn(0); 2519 } 2520 2521 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 2522 { 2523 Mat_SeqAIJCUSPARSE *cusp; 2524 CsrMatrix *matrix; 2525 2526 PetscFunctionBegin; 2527 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2528 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2529 cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 2530 PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 2531 matrix = (CsrMatrix*)cusp->mat->mat; 2532 2533 if (i) { 2534 #if !defined(PETSC_USE_64BIT_INDICES) 2535 *i = matrix->row_offsets->data().get(); 2536 #else 2537 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 2538 #endif 2539 } 2540 if (j) { 2541 #if 
!defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Mirror the host CSR data onto the GPU when the device copy is missing or stale.
   If the nonzero pattern is unchanged and the format is CSR, only the values are
   re-uploaded; otherwise the device structures are destroyed and rebuilt from scratch
   (CSR, or ELL/HYB on pre-CUDA-11 toolkits). Compressed rows are honored via
   a->compressedrow when in use. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  PetscInt                     m               = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when host values are absent, so offloadmask stays GPU-only */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      /* Nonzero pattern changed (or non-CSR format): rebuild all device structures */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* pattern only, no host values yet */
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (pointer mode is DEVICE below) */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Build a temporary CSR on device, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: second tuple element += first (y += x). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: second tuple element = first (y = x). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: first tuple element = second (x = y). */
struct
       VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product workspace attached to C->product->data for sparse-dense and
   sparse-sparse products; owns device buffers and cuSPARSE descriptors. */
struct MatMatCusparse {
  PetscBool      cisdense;  /* C was MATSEQDENSE on entry; convert back after the GPU product */
  PetscScalar    *Bt;       /* device buffer for B^T (pre-CUDA-11 csrmm has no transpose-B) */
  Mat            X;         /* intermediate dense result for PtAP/RARt */
  PetscBool      reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse: releases all device buffers, cuSPARSE descriptors,
   the intermediate matrix X, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) where A is MATSEQAIJCUSPARSE and B is dense.
   Supports AB, AtB, ABt, PtAP and RARt (the latter two via the intermediate X
   followed by a dense-dense product). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose with a non-transpose op */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA =
CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* the sparse product lands in the intermediate X; the dense product into C follows */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ?
    CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow the work buffer only when the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    /* explicitly transpose B into mmdata->Bt with cuBLAS geam */
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  if (mmdata->cisdense) {
    /* the caller expects a CPU dense matrix back */
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(A) x dense(B) products: sizes C, converts it to
   MATSEQDENSECUDA, and allocates the MatMatCusparse workspace (plus the
   intermediate X for PtAP/RARt). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3033 { 3034 Mat_Product *product = C->product; 3035 Mat A,B; 3036 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3037 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 3038 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3039 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3040 PetscBool flg; 3041 cusparseStatus_t stat; 3042 MatProductType ptype; 3043 MatMatCusparse *mmdata; 3044 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3045 cusparseSpMatDescr_t BmatSpDescr; 3046 #endif 3047 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3048 3049 PetscFunctionBegin; 3050 MatCheckProduct(C,1); 3051 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 3052 PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 3053 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 3054 mmdata = (MatMatCusparse*)C->product->data; 3055 A = product->A; 3056 B = product->B; 3057 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 3058 mmdata->reusesym = PETSC_FALSE; 3059 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3060 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3061 Cmat = Ccusp->mat; 3062 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 3063 Ccsr = (CsrMatrix*)Cmat->mat; 3064 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 3065 goto finalize; 3066 } 3067 if (!c->nz) goto finalize; 3068 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 3069 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type 
%s",((PetscObject)A)->type_name); 3070 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 3071 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 3072 PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 3073 PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 3074 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3075 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 3076 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3077 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3078 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3079 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3080 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3081 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3082 3083 ptype = product->type; 3084 if (A->symmetric && ptype == MATPRODUCT_AtB) { 3085 ptype = MATPRODUCT_AB; 3086 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 3087 } 3088 if (B->symmetric && ptype == MATPRODUCT_ABt) { 3089 ptype = MATPRODUCT_AB; 3090 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 3091 } 3092 switch (ptype) { 3093 case MATPRODUCT_AB: 3094 Amat = Acusp->mat; 3095 Bmat = Bcusp->mat; 3096 break; 3097 case MATPRODUCT_AtB: 3098 Amat = Acusp->matTranspose; 3099 Bmat = Bcusp->mat; 3100 break; 3101 case 
MATPRODUCT_ABt: 3102 Amat = Acusp->mat; 3103 Bmat = Bcusp->matTranspose; 3104 break; 3105 default: 3106 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3107 } 3108 Cmat = Ccusp->mat; 3109 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 3110 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 3111 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 3112 Acsr = (CsrMatrix*)Amat->mat; 3113 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 3114 Ccsr = (CsrMatrix*)Cmat->mat; 3115 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 3116 PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 3117 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 3118 PetscCall(PetscLogGpuTimeBegin()); 3119 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3120 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 3121 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3122 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3123 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3124 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3125 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3126 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3127 #else 3128 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3129 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3130 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3131 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3132 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3133 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3134 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3135 #endif 3136 #else 3137 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3138 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3139 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3140 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3141 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3142 #endif 3143 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3144 PetscCallCUDA(WaitForCUDA()); 3145 PetscCall(PetscLogGpuTimeEnd()); 3146 C->offloadmask = PETSC_OFFLOAD_GPU; 3147 finalize: 3148 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3149 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 3150 PetscCall(PetscInfo(C,"Number of mallocs during 
MatSetValues() is 0\n")); 3151 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 3152 c->reallocs = 0; 3153 C->info.mallocs += 0; 3154 C->info.nz_unneeded = 0; 3155 C->assembled = C->was_assembled = PETSC_TRUE; 3156 C->num_ass++; 3157 PetscFunctionReturn(0); 3158 } 3159 3160 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3161 { 3162 Mat_Product *product = C->product; 3163 Mat A,B; 3164 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3165 Mat_SeqAIJ *a,*b,*c; 3166 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3167 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3168 PetscInt i,j,m,n,k; 3169 PetscBool flg; 3170 cusparseStatus_t stat; 3171 MatProductType ptype; 3172 MatMatCusparse *mmdata; 3173 PetscLogDouble flops; 3174 PetscBool biscompressed,ciscompressed; 3175 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3176 int64_t C_num_rows1, C_num_cols1, C_nnz1; 3177 cusparseSpMatDescr_t BmatSpDescr; 3178 #else 3179 int cnz; 3180 #endif 3181 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3182 3183 PetscFunctionBegin; 3184 MatCheckProduct(C,1); 3185 PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 3186 A = product->A; 3187 B = product->B; 3188 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 3189 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 3190 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 3191 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 3192 a = (Mat_SeqAIJ*)A->data; 3193 b = (Mat_SeqAIJ*)B->data; 3194 /* product data */ 3195 PetscCall(PetscNew(&mmdata)); 3196 C->product->data = mmdata; 3197 C->product->destroy = MatDestroy_MatMatCusparse; 3198 3199 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3200 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3201 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3202 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 3203 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3204 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3205 3206 ptype = product->type; 3207 if (A->symmetric && ptype == MATPRODUCT_AtB) { 3208 ptype = MATPRODUCT_AB; 3209 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3210 } 3211 if (B->symmetric && ptype == MATPRODUCT_ABt) { 3212 ptype = MATPRODUCT_AB; 3213 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3214 } 3215 biscompressed = PETSC_FALSE; 3216 ciscompressed = PETSC_FALSE; 3217 switch (ptype) { 3218 case MATPRODUCT_AB: 3219 m = A->rmap->n; 3220 n = B->cmap->n; 3221 k = A->cmap->n; 3222 Amat = Acusp->mat; 3223 Bmat = Bcusp->mat; 3224 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3225 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3226 break; 3227 case MATPRODUCT_AtB: 3228 m = A->cmap->n; 3229 n = B->cmap->n; 3230 k = A->rmap->n; 3231 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3232 Amat = Acusp->matTranspose; 3233 Bmat = Bcusp->mat; 3234 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3235 break; 3236 case MATPRODUCT_ABt: 3237 m = A->rmap->n; 3238 n = B->rmap->n; 3239 k = A->cmap->n; 3240 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3241 Amat = Acusp->mat; 3242 Bmat = Bcusp->matTranspose; 3243 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3244 break; 3245 default: 3246 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3247 } 3248 3249 /* create cusparse matrix */ 3250 PetscCall(MatSetSizes(C,m,n,m,n)); 3251 PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 3252 c = 
(Mat_SeqAIJ*)C->data; 3253 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3254 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3255 Ccsr = new CsrMatrix; 3256 3257 c->compressedrow.use = ciscompressed; 3258 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3259 c->compressedrow.nrows = a->compressedrow.nrows; 3260 PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 3261 PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 3262 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3263 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3264 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 3265 } else { 3266 c->compressedrow.nrows = 0; 3267 c->compressedrow.i = NULL; 3268 c->compressedrow.rindex = NULL; 3269 Ccusp->workVector = NULL; 3270 Cmat->cprowIndices = NULL; 3271 } 3272 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 3273 Ccusp->mat = Cmat; 3274 Ccusp->mat->mat = Ccsr; 3275 Ccsr->num_rows = Ccusp->nrows; 3276 Ccsr->num_cols = n; 3277 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 3278 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3279 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3280 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3281 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 3282 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 3283 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 3284 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3285 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3286 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3287 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3288 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 3289 c->nz = 0; 3290 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3291 Ccsr->values = new THRUSTARRAY(c->nz); 3292 goto finalizesym; 3293 } 3294 3295 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 3296 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 3297 Acsr = (CsrMatrix*)Amat->mat; 3298 if (!biscompressed) { 3299 Bcsr = (CsrMatrix*)Bmat->mat; 3300 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3301 BmatSpDescr = Bmat->matDescr; 3302 #endif 3303 } else { /* we need to use row offsets for the full matrix */ 3304 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 3305 Bcsr = new CsrMatrix; 3306 Bcsr->num_rows = B->rmap->n; 3307 Bcsr->num_cols = cBcsr->num_cols; 3308 Bcsr->num_entries = cBcsr->num_entries; 3309 Bcsr->column_indices = cBcsr->column_indices; 3310 Bcsr->values = cBcsr->values; 3311 if (!Bcusp->rowoffsets_gpu) { 3312 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3313 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3314 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 3315 } 3316 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3317 mmdata->Bcsr = Bcsr; 3318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3319 if (Bcsr->num_rows && Bcsr->num_cols) { 3320 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 3321 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3322 Bcsr->values->data().get(), 3323 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3324 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 3325 } 3326 BmatSpDescr = mmdata->matSpBDescr; 3327 #endif 3328 } 3329 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 3330 
PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 3331 /* precompute flops count */ 3332 if (ptype == MATPRODUCT_AB) { 3333 for (i=0, flops = 0; i<A->rmap->n; i++) { 3334 const PetscInt st = a->i[i]; 3335 const PetscInt en = a->i[i+1]; 3336 for (j=st; j<en; j++) { 3337 const PetscInt brow = a->j[j]; 3338 flops += 2.*(b->i[brow+1] - b->i[brow]); 3339 } 3340 } 3341 } else if (ptype == MATPRODUCT_AtB) { 3342 for (i=0, flops = 0; i<A->rmap->n; i++) { 3343 const PetscInt anzi = a->i[i+1] - a->i[i]; 3344 const PetscInt bnzi = b->i[i+1] - b->i[i]; 3345 flops += (2.*anzi)*bnzi; 3346 } 3347 } else { /* TODO */ 3348 flops = 0.; 3349 } 3350 3351 mmdata->flops = flops; 3352 PetscCall(PetscLogGpuTimeBegin()); 3353 3354 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3355 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3356 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 3357 NULL, NULL, NULL, 3358 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3359 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 3360 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3361 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3362 { 3363 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3364 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3365 */ 3366 void* dBuffer1 = NULL; 3367 void* dBuffer2 = NULL; 3368 void* dBuffer3 = NULL; 3369 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3370 size_t bufferSize1 = 0; 3371 size_t bufferSize2 = 0; 3372 size_t bufferSize3 = 0; 3373 size_t bufferSize4 = 0; 3374 size_t bufferSize5 = 0; 3375 3376 /*----------------------------------------------------------------------*/ 3377 /* ask bufferSize1 bytes for external memory */ 3378 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3379 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3380 &bufferSize1, NULL);PetscCallCUSPARSE(stat); 3381 PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 3382 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3383 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3384 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3385 &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 3386 3387 /*----------------------------------------------------------------------*/ 3388 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3389 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3390 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 3391 PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 3392 PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 3393 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 3394 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3395 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3396 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 3397 
PetscCallCUDA(cudaFree(dBuffer1)); 3398 PetscCallCUDA(cudaFree(dBuffer2)); 3399 3400 /*----------------------------------------------------------------------*/ 3401 /* get matrix C non-zero entries C_nnz1 */ 3402 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3403 c->nz = (PetscInt) C_nnz1; 3404 /* allocate matrix C */ 3405 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3406 Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3407 /* update matC with the new pointers */ 3408 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 3409 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3410 3411 /*----------------------------------------------------------------------*/ 3412 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3413 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3414 &bufferSize5, NULL);PetscCallCUSPARSE(stat); 3415 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 3416 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3417 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3418 &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 3419 PetscCallCUDA(cudaFree(dBuffer3)); 3420 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3421 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3422 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3423 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3424 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 
3425 } 3426 #else 3427 size_t bufSize2; 3428 /* ask bufferSize bytes for external memory */ 3429 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3430 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3431 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3432 mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 3433 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 3434 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3435 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3436 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3437 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3438 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 3439 /* ask bufferSize again bytes for external memory */ 3440 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3441 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3442 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3443 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 3444 /* The CUSPARSE documentation is not clear, nor the API 3445 We need both buffers to perform the operations properly! 3446 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3447 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3448 is stored in the descriptor! What a messy API... 
*/ 3449 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 3450 /* compute the intermediate product of A * B */ 3451 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3452 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3453 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3454 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3455 /* get matrix C non-zero entries C_nnz1 */ 3456 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3457 c->nz = (PetscInt) C_nnz1; 3458 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 3459 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3460 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3461 Ccsr->values = new THRUSTARRAY(c->nz); 3462 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3463 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 3464 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3465 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3466 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3467 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3468 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3469 #else 3470 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3471 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 3472 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3473 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3474 Bmat->descr, Bcsr->num_entries, 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3475 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 3476 c->nz = cnz; 3477 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3478 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3479 Ccsr->values = new THRUSTARRAY(c->nz); 3480 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3481 3482 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3483 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3484 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3485 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3486 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3487 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3488 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3489 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3490 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3491 #endif 3492 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3493 PetscCall(PetscLogGpuTimeEnd()); 3494 finalizesym: 3495 c->singlemalloc = PETSC_FALSE; 3496 c->free_a = PETSC_TRUE; 3497 c->free_ij = PETSC_TRUE; 3498 PetscCall(PetscMalloc1(m+1,&c->i)); 3499 PetscCall(PetscMalloc1(c->nz,&c->j)); 3500 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3501 PetscInt *d_i = c->i; 3502 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3503 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3504 ii = *Ccsr->row_offsets; 3505 jj = *Ccsr->column_indices; 
3506 if (ciscompressed) d_i = c->compressedrow.i; 3507 PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3508 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3509 } else { 3510 PetscInt *d_i = c->i; 3511 if (ciscompressed) d_i = c->compressedrow.i; 3512 PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3513 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3514 } 3515 if (ciscompressed) { /* need to expand host row offsets */ 3516 PetscInt r = 0; 3517 c->i[0] = 0; 3518 for (k = 0; k < c->compressedrow.nrows; k++) { 3519 const PetscInt next = c->compressedrow.rindex[k]; 3520 const PetscInt old = c->compressedrow.i[k]; 3521 for (; r < next; r++) c->i[r+1] = old; 3522 } 3523 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 3524 } 3525 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 3526 PetscCall(PetscMalloc1(m,&c->ilen)); 3527 PetscCall(PetscMalloc1(m,&c->imax)); 3528 c->maxnz = c->nz; 3529 c->nonzerorowcnt = 0; 3530 c->rmax = 0; 3531 for (k = 0; k < m; k++) { 3532 const PetscInt nn = c->i[k+1] - c->i[k]; 3533 c->ilen[k] = c->imax[k] = nn; 3534 c->nonzerorowcnt += (PetscInt)!!nn; 3535 c->rmax = PetscMax(c->rmax,nn); 3536 } 3537 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3538 PetscCall(PetscMalloc1(c->nz,&c->a)); 3539 Ccsr->num_entries = c->nz; 3540 3541 C->nonzerostate++; 3542 PetscCall(PetscLayoutSetUp(C->rmap)); 3543 PetscCall(PetscLayoutSetUp(C->cmap)); 3544 Ccusp->nonzerostate = C->nonzerostate; 3545 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3546 C->preallocated = PETSC_TRUE; 3547 C->assembled = PETSC_FALSE; 3548 C->was_assembled = PETSC_FALSE; 3549 if (product->api_user && A->offloadmask 
== PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask  = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B
   Selects the symbolic-product implementation for mat (the product C) based on:
   - whether B is dense (MATSEQDENSE) vs a cusparse AIJ matrix,
   - whether A, B (and C for ABC products) are bound to the CPU,
   - user options (-mat*_backend_cpu / -mat_product_algorithm_backend_cpu) that force the CPU backend.
   Falls back to the plain SeqAIJ dispatch when the GPU path does not apply. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* B counts as "cusparse" only when neither A nor B is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The option name depends on which API entry point (MatMatMult() etc. vs MatProductCreate()) the user came through */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composite products are expressed as a sequence of basic two-matrix products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx (no transpose, no add) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^H * xx */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H * xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T * xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): scatter-add a compressed work vector back into the full vector.
   Launched with 1D grid of 256-thread blocks; each thread handles one entry, guarded by i < n.
   NOTE(review): i is a 32-bit int while n is PetscInt; overflows for n >= 2^31 when PetscInt is 64-bit — confirm n stays small. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* dense-vector lengths for the generic SpMV API; set only on the CSR path, which is the only path that reads them */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A)x == 0, so the result is just y (or zero) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose and a non-transpose SpMV */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[]; guard against future ABI changes of the enum values */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply+add) per stored nonzero; without the add of y, one add per nonempty row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Finish assembly on the host, then drop the cached device-side matrix if the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (onnz != A->nonzerostate &&
cusp->deviceMat) {

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

/* Release GPU-side storage (mat or triangular factors), remove composed methods, then destroy the host SeqAIJ part */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate as a host SeqAIJ matrix, then convert the copy in place to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Uses cublas axpy when the nonzero patterns match, cusparse spgeam for
   SUBSET_NONZERO_PATTERN, and falls back to the host MatAXPY_SeqAIJ otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed backends: invalidate Y's cached transpose and do the update on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column structure on the device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cusparse spgeam, writing the result in place into Y's arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* alpha/beta are host pointers (&a, &b) here, so switch pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr,     ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,     ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,     ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the values arrays line up, so this is a dense axpy over the nz values */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* DIFFERENT_NONZERO_PATTERN (or the one-column workaround): do it on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: scale the nonzero values in place on the GPU with cublas */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t
cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero the matrix values on both host and device (including the cached transpose), keeping the structure */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values were zeroed too, so host and device stay in sync */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Swap the operation tables between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE) implementations,
   and compose/remove the matching query functions. flg == PETSC_TRUE binds to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; their ops are managed elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make sure the host copy is current before routing operations to the CPU */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ-internal ops table (getarray etc.) set in the GPU branch below */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to the CPU and available */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: install the cusparse spptr (handle, formats, algorithms),
   switch the default vector type to VECCUDA, and install the GPU operation table via MatBindToCPU */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4252 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4253 spptr->format = MAT_CUSPARSE_CSR; 4254 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4255 #if CUSPARSE_VERSION > 11301 4256 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4257 #else 4258 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4259 #endif 4260 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4261 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4262 #endif 4263 B->spptr = spptr; 4264 } else { 4265 Mat_SeqAIJCUSPARSETriFactors *spptr; 4266 4267 PetscCall(PetscNew(&spptr)); 4268 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4269 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4270 B->spptr = spptr; 4271 } 4272 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4273 } 4274 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4275 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4276 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4277 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4278 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4279 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4280 4281 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 4282 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 4283 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4284 #if defined(PETSC_HAVE_HYPRE) 4285 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 4286 #endif 4287 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4288 PetscFunctionReturn(0); 4289 } 4290 4291 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4292 { 4293 PetscFunctionBegin; 4294 
  /* create a plain SeqAIJ matrix, then convert it in place to the CUSPARSE type */
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE and CUSPARSEBAND solver types with PETSc's factorization registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}

/* Free the COO assembly state held on the device: the permutation arrays of the 'Basic'
   path and the jmap/perm maps of the extended path */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Destroy the Mat_SeqAIJCUSPARSE backend structure and every device resource it owns */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (device CSR value/index arrays) and zero the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL; /* use NULL, not 0, for pointers as elsewhere in this file */
  }
  PetscFunctionReturn(0);
}

/* Destroy a triangular-factor structure: cusparse descriptors, solve info, CSR storage and buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Destroy a mat-mult structure in the given storage format (ELL/HYB only exist pre CUDA-11) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* check the error code, as every other destroy call here does */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* free each cached SpMV configuration that was initialized */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release all triangular-factor data while keeping the TriFactors container (and its handle) alive */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* resources of the SpSV-based factorization path */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    PetscCallCUDA(cudaFree(fs->factBuffer_M));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->builtSolveTranspose = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}

/* Fully destroy the triangular-factor container, including its cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if ((handle = (*trifactors)->handle)) { /* extra parentheses: assignment intended, not '==' */
      PetscCallCUSPARSE(cusparseDestroy(handle));
    }
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}

/* Lexicographic ordering of COO entries: by row index, then by column index */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (i,j) pairs, used to collapse duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when equal, 1 when different; used with adjacent_difference to flag index changes */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* Logical OR of two change flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO state: nothing to scatter, just finish assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
4559 } 4560 matrix = (CsrMatrix*)cusp->mat->mat; 4561 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4562 if (!v) { 4563 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 4564 goto finalize; 4565 } 4566 n = cusp->cooPerm->size(); 4567 if (isCudaMem(v)) { 4568 d_v = thrust::device_pointer_cast(v); 4569 } else { 4570 cooPerm_v = new THRUSTARRAY(n); 4571 cooPerm_v->assign(v,v+n); 4572 d_v = cooPerm_v->data(); 4573 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4574 } 4575 PetscCall(PetscLogGpuTimeBegin()); 4576 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4577 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4578 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 4579 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4580 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4581 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4582 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
4583 */ 4584 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4585 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 4586 delete cooPerm_w; 4587 } else { 4588 /* all nonzeros in d_v[] are unique entries */ 4589 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4590 matrix->values->begin())); 4591 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4592 matrix->values->end())); 4593 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 4594 } 4595 } else { 4596 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 4597 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4598 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4599 } else { 4600 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4601 matrix->values->begin())); 4602 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4603 matrix->values->end())); 4604 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4605 } 4606 } 4607 PetscCall(PetscLogGpuTimeEnd()); 4608 finalize: 4609 delete cooPerm_v; 4610 A->offloadmask = PETSC_OFFLOAD_GPU; 4611 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4612 /* shorter version of MatAssemblyEnd_SeqAIJ */ 4613 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 4614 
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose structure as stale; when destroy is true also free it
   (and the csr2csc index map used to rebuild it) */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  /* previously computed COO state is only reusable when the entry count matches */
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a=[0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR metadata (a->i, a->j, row lengths) from the de-duplicated device arrays */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    /* empty COO list: fall back to a zero-row preallocation */
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: use the fast 'Basic' path when the indices are on the device
   or contain no negative entries; otherwise use the generic SeqAIJ COO machinery and mirror its
   jmap/perm maps on the GPU (the "extended" path) */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* negative indices are not handled by the Basic path */
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host-side COO maps onto the device for MatSetValuesCOO */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

/* Kernel: for each nonzero i, sum the COO inputs mapped to it (kv[perm[k]] for
   k in [jmap[i],jmap[i+1])) and add into a[i], or overwrite a[i] for INSERT_VALUES.
   Launched with a 1-D grid; the grid-stride loop handles any launch size. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}

/* Scatter/accumulate user COO values v[] into the matrix, on the device */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount         Annz  = seq->nz;
  PetscMemType       memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write-only access: no host->device copy */
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without syncing */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype))
      PetscCallCUDA(cudaFree((void*)v1)); /* free the staged device copy of v[] */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily build the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data 4921 4922 Level: developer 4923 4924 Notes: may trigger host-device copies if up-to-date matrix data is on host 4925 4926 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 4927 @*/ 4928 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4929 { 4930 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4931 CsrMatrix *csr; 4932 4933 PetscFunctionBegin; 4934 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4935 PetscValidPointer(a,2); 4936 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4937 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4938 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4939 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4940 csr = (CsrMatrix*)cusp->mat->mat; 4941 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4942 *a = csr->values->data().get(); 4943 PetscFunctionReturn(0); 4944 } 4945 4946 /*@C 4947 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4948 4949 Not Collective 4950 4951 Input Parameter: 4952 . A - a MATSEQAIJCUSPARSE matrix 4953 4954 Output Parameter: 4955 . a - pointer to the device data 4956 4957 Level: developer 4958 4959 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 4960 @*/ 4961 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4962 { 4963 PetscFunctionBegin; 4964 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4965 PetscValidPointer(a,2); 4966 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4967 *a = NULL; 4968 PetscFunctionReturn(0); 4969 } 4970 4971 /*@C 4972 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4973 4974 Not Collective 4975 4976 Input Parameter: 4977 . 
A - a MATSEQAIJCUSPARSE matrix 4978 4979 Output Parameter: 4980 . a - pointer to the device data 4981 4982 Level: developer 4983 4984 Notes: may trigger host-device copies if up-to-date matrix data is on host 4985 4986 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 4987 @*/ 4988 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4989 { 4990 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4991 CsrMatrix *csr; 4992 4993 PetscFunctionBegin; 4994 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4995 PetscValidPointer(a,2); 4996 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4997 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4998 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4999 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5000 csr = (CsrMatrix*)cusp->mat->mat; 5001 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5002 *a = csr->values->data().get(); 5003 A->offloadmask = PETSC_OFFLOAD_GPU; 5004 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5005 PetscFunctionReturn(0); 5006 } 5007 /*@C 5008 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 5009 5010 Not Collective 5011 5012 Input Parameter: 5013 . A - a MATSEQAIJCUSPARSE matrix 5014 5015 Output Parameter: 5016 . 
a - pointer to the device data 5017 5018 Level: developer 5019 5020 .seealso: `MatSeqAIJCUSPARSEGetArray()` 5021 @*/ 5022 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 5023 { 5024 PetscFunctionBegin; 5025 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5026 PetscValidPointer(a,2); 5027 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5028 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 5029 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5030 *a = NULL; 5031 PetscFunctionReturn(0); 5032 } 5033 5034 /*@C 5035 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 5036 5037 Not Collective 5038 5039 Input Parameter: 5040 . A - a MATSEQAIJCUSPARSE matrix 5041 5042 Output Parameter: 5043 . a - pointer to the device data 5044 5045 Level: developer 5046 5047 Notes: does not trigger host-device copies and flags data validity on the GPU 5048 5049 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 5050 @*/ 5051 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 5052 { 5053 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5054 CsrMatrix *csr; 5055 5056 PetscFunctionBegin; 5057 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5058 PetscValidPointer(a,2); 5059 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5060 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5061 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5062 csr = (CsrMatrix*)cusp->mat->mat; 5063 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5064 *a = csr->values->data().get(); 5065 A->offloadmask = PETSC_OFFLOAD_GPU; 5066 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5067 PetscFunctionReturn(0); 5068 } 5069 5070 /*@C 5071 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access 
array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* device values were (re)written: invalidate the cached diagonal and bump the
     object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* zero the caller's pointer so stale use after restore is caught early */
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Comparator for (row, col, value, flag) tuples: orders lexicographically by
   (row, col); values and flags ride along. Used to merge two COO listings. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed integer offset to an index (used to shift column
   indices / row offsets when concatenating matrices) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation, i.e. C = [A B] (horizontal concatenation).

   A and B must have the same number of rows and both be MATSEQAIJCUSPARSE.
   With MAT_INITIAL_MATRIX the sparsity pattern of C is built on the GPU (via a
   COO merge) and a permutation (cooPerm) is stored so that MAT_REUSE_MATRIX can
   scatter new values of A and B straight into C. MAT_INPLACE_MATRIX is not
   supported. On exit C's data is valid on the GPU only. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices = NULL;
    /* C stores all rows explicitly, never in compressed-row form */
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows      = m;
    Ccusp->mat        = Cmat;
    Ccusp->mat->mat   = Ccsr;
    Ccsr->num_rows    = m;
    Ccsr->num_cols    = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV wrappers */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records, for each entry of C, where the corresponding A (first Annz
       slots) or B entry landed; it drives value-only updates on reuse */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand both CSR row offsets to explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's column indices are shifted by A->cmap->n on the fly, since B's columns
         come after A's in the concatenated matrix */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row,col)-sorted COO streams; the 1/0 flag in the fourth slot
         marks whether an output entry originated from A or B */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the destination positions: entries flagged 1 (from A) go to the first
         Annz slots of cooPerm, entries flagged 0 (from B) to the remaining Bnnz */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] is a vertical stack: row offsets of B^T are appended
           shifted by nnz(A), columns and values are simply concatenated */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A^T's closing offset with B^T's first (shifted) one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the GPU sparsity pattern on the host so C behaves as a normal SeqAIJ */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* scatter A's values through the first Annz entries of cooPerm and B's values
         through the rest; only values change, the pattern is reused */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are a plain concatenation [A^T values; B^T values] */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gathers v[k] = A(idx[k]) from the device values of A (flat indices into the
   nonzero array). v may be host or device memory (detected via isCudaMem);
   when idx is NULL the first n values are copied verbatim. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the indices, gather on the device, then (if v is host memory)
       copy the gathered block back */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* data moved device->host when v is host memory, so log GPU-to-CPU traffic
     (was PetscLogCpuToGpu, which logged the wrong direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}