/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Enum-name table consumed by PetscOptionsEnum(): value names in 0-based order,
   followed by the enum type name, the common prefix, and a NULL sentinel */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* Note: entries appear in 0-based *value* order, which is why "COO_ALG4" (value 5)
   precedes "CSR_ALG2" (value 6) but follows "CSR_ALG1" (value 4) in the SpMM table */
const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side helper structures (C++ overloads on the pointer type) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Plugin implementation behind MatCUSPARSESetFormat(): records the requested GPU
   storage format in the Mat_SeqAIJCUSPARSE structure hanging off A->spptr. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: for SEQAIJCUSPARSE both ops set the same single format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod is a no-op for matrix types that do not implement the method */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}

/* Plugin implementation behind MatCUSPARSESetUseCPUSolve(): simply records the flag. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this method to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently ignored for other Mat types */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}

/* MatSetOption override: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the cached GPU
   transpose can be invalidated; everything else is delegated to the SeqAIJ version. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself runs on the CPU (SeqAIJ kernel);
   afterwards the solve function pointers are chosen and, unless a CPU solve was
   requested, the triangular factors are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS isrow = b->row,iscol = b->col;
  PetscBool row_identity,col_identity;
  /* NOTE(review): B is a factor matrix; confirm B->spptr indeed holds a Mat_SeqAIJCUSPARSE here */
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before factoring on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* natural ordering: no permutation vectors needed during the solve */
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

/* Processes -mat_cusparse_* command line options (storage format, CPU solve,
   and, for CUDA >= 11, the SpMV/SpMM/csr2csc algorithm selections). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool flg;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* drop any stale GPU factor data before the (CPU) symbolic phase */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: CPU symbolic factorization, then hook in the CUSPARSE numeric stage */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic factorization, then hook in the CUSPARSE numeric stage */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic factorization, then hook in the CUSPARSE numeric stage */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Builds (or updates) the GPU copy of the unit lower triangular factor L from the
   host-side combined ILU factor stored in A (a Mat_SeqAIJ factor matrix).
   First call: allocates the CSR structure (with explicit unit diagonal) in pinned
   host memory, uploads it, and runs the cuSPARSE csrsv analysis.
   Subsequent calls: only refresh the numerical values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt *ai = a->i,*aj = a->j,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiLo, *AjLo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (row 0 of L holds only the unit diagonal, hence ai[1]-ai[0] strictly-lower entries start at row 1) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the later thrust assign/upload is fast */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* device-side CSR arrays; assign() performs the host-to-device copy */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  loTriFactor->solveInfo,
                                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for fast value-only updates; the index buffers are freed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v = aa;
        vi = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds (or updates) the GPU copy of the upper triangular factor U; structure
   mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscInt i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* (SeqAIJ factors store U rows backwards via adiag, so adiag[0]-adiag[n] counts all of U) */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the later thrust assign/upload is fast */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix: rows are walked from last to first,
           filling the CSR arrays back-to-front */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          /* (stored factor keeps 1/diag at v[nz], inverted here into the explicit value) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* device-side CSR arrays; assign() performs the host-to-device copy */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                                  upTriFactor->solveInfo,
                                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is kept (as AA_h) for fast value-only updates; the index buffers are freed */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU and caches the row/column
   permutation indices (when the orderings are not the identity) for use by
   the permuted MatSolve paths. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS isrow = a->row,iscol = a->icol;
  PetscBool row_identity,col_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector reused by the two-stage (L then U) triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  /* permutations are uploaded once and cached; skipped entirely for identity orderings */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Builds the GPU triangular structures for an ICC/Cholesky factor.
   NOTE(review): A->data is read both as Mat_SeqAIJ (for a->nz) and as
   Mat_SeqSBAIJ (for b->i/b->j/b->a) — this relies on the factor actually
   being stored in SBAIJ layout; confirm against the Cholesky factor type. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscScalar *AALo;
  PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
  const
PetscInt *ai = b->i,*aj = b->j,*vj; 651 const MatScalar *aa = b->a,*v; 652 653 PetscFunctionBegin; 654 if (!n) PetscFunctionReturn(0); 655 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 656 try { 657 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 658 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 659 if (!upTriFactor && !loTriFactor) { 660 /* Allocate Space for the upper triangular matrix */ 661 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 662 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 663 664 /* Fill the upper triangular matrix */ 665 AiUp[0]=(PetscInt) 0; 666 AiUp[n]=nzUpper; 667 offset = 0; 668 for (i=0; i<n; i++) { 669 /* set the pointers */ 670 v = aa + ai[i]; 671 vj = aj + ai[i]; 672 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 673 674 /* first, set the diagonal elements */ 675 AjUp[offset] = (PetscInt) i; 676 AAUp[offset] = (MatScalar)1.0/v[nz]; 677 AiUp[i] = offset; 678 AALo[offset] = (MatScalar)1.0/v[nz]; 679 680 offset+=1; 681 if (nz>0) { 682 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 683 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 684 for (j=offset; j<offset+nz; j++) { 685 AAUp[j] = -AAUp[j]; 686 AALo[j] = AAUp[j]/v[nz]; 687 } 688 offset+=nz; 689 } 690 } 691 692 /* allocate space for the triangular factor information */ 693 PetscCall(PetscNew(&upTriFactor)); 694 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 695 696 /* Create the matrix description */ 697 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 698 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 699 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 700 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 701 #else 702 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 703 #endif 704 
PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 705 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 706 707 /* set the matrix */ 708 upTriFactor->csrMat = new CsrMatrix; 709 upTriFactor->csrMat->num_rows = A->rmap->n; 710 upTriFactor->csrMat->num_cols = A->cmap->n; 711 upTriFactor->csrMat->num_entries = a->nz; 712 713 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 714 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 715 716 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 717 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 718 719 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 720 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 721 722 /* set the operation */ 723 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 724 725 /* Create the solve analysis information */ 726 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 727 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 728 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 729 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 730 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 731 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 732 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 733 &upTriFactor->solveBufferSize)); 734 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 735 #endif 736 737 /* perform the solve analysis */ 738 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 739 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 740 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 741 
upTriFactor->csrMat->column_indices->data().get(), 742 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 743 upTriFactor->solveInfo, 744 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 745 #else 746 upTriFactor->solveInfo)); 747 #endif 748 PetscCallCUDA(WaitForCUDA()); 749 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 750 751 /* assign the pointer */ 752 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 753 754 /* allocate space for the triangular factor information */ 755 PetscCall(PetscNew(&loTriFactor)); 756 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 757 758 /* Create the matrix description */ 759 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 760 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 761 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 762 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 763 #else 764 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 765 #endif 766 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 767 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 768 769 /* set the operation */ 770 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 771 772 /* set the matrix */ 773 loTriFactor->csrMat = new CsrMatrix; 774 loTriFactor->csrMat->num_rows = A->rmap->n; 775 loTriFactor->csrMat->num_cols = A->cmap->n; 776 loTriFactor->csrMat->num_entries = a->nz; 777 778 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 779 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 780 781 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 782 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 783 784 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 785 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 786 787 /* Create the solve 
analysis information */ 788 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 789 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 790 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 791 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 792 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 793 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 794 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 795 &loTriFactor->solveBufferSize)); 796 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 797 #endif 798 799 /* perform the solve analysis */ 800 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 801 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 802 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 803 loTriFactor->csrMat->column_indices->data().get(), 804 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 805 loTriFactor->solveInfo, 806 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 807 #else 808 loTriFactor->solveInfo)); 809 #endif 810 PetscCallCUDA(WaitForCUDA()); 811 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 812 813 /* assign the pointer */ 814 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 815 816 PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 817 PetscCallCUDA(cudaFreeHost(AiUp)); 818 PetscCallCUDA(cudaFreeHost(AjUp)); 819 } else { 820 /* Fill the upper triangular matrix */ 821 offset = 0; 822 for (i=0; i<n; i++) { 823 /* set the pointers */ 824 v = aa + ai[i]; 825 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 826 827 /* first, set the diagonal elements */ 828 AAUp[offset] = 1.0/v[nz]; 829 AALo[offset] = 1.0/v[nz]; 830 831 offset+=1; 832 if (nz>0) { 833 
            /* copy the strict upper part of row i and negate it; the AALo copy is
               additionally scaled by the inverse diagonal (v[nz] is the diagonal of
               this row, cf. the "exclude diag[i]" computation of nz above) */
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        /* sparsity pattern is unchanged on this path: only refresh the numerical
           values of the already-allocated device factors */
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      /* pinned host staging buffers are no longer needed */
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Upload the ICC (Cholesky) triangular factors of A to the GPU (building them if needed),
   allocate the device work vector used by the triangular solves, and cache device copies
   of the row permutation and its inverse when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* total nnz of both triangular factors: off-diagonal entries are stored twice
     (once per factor), the diagonal only once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    /* device copies: rpermIndices gets the permutation, cpermIndices its inverse */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: the factorization itself runs on the CPU via
   MatCholeskyFactorNumeric_SeqAIJ; afterwards the triangular factors are pushed
   to the GPU and the solve function pointers are chosen based on the ordering. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ*)B->data;
  IS         ip = b->row;
  PetscBool  perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    /* natural ordering: no permutation shuffles needed in the solve */
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Build the transposes of the lower and upper triangular factors on the GPU (as CSC of
   the originals) and run the cuSPARSE triangular-solve analysis on them, so that
   MatSolveTranspose can run entirely on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the
transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing flips the fill mode; index base, type, and diag type carry over */
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transpose is stored explicitly, so solves use NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparseCsr2cscEx2 needs an explicit workspace buffer; size and allocate it first */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                                  loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
                                                  loTriFactorT->csrMat->values->data().get(),
                                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                     loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                     loTriFactor->csrMat->values->data().get(),
                                     loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(),
                                     loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: this was PetscLogEventBegin(), leaving the GenerateTranspose event begun
     twice and never ended, which corrupts -log_view event accounting */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                            &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                            loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactorT->solveInfo,
                                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                                  upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
                                                  upTriFactorT->csrMat->values->data().get(),
                                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                     upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                     upTriFactor->csrMat->values->data().get(),
                                     upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(),
                                     upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: this was PetscLogEventBegin() as well; it must end the event begun above */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                            &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO(review): the lower- and upper-factor paths above are identical modulo the
     factor pointer; consider extracting a shared static helper */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                            upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactorT->solveInfo,
                                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Functor: truncate the real part of a PetscScalar to a PetscInt (used to turn the
   csr2csc-permuted sequence 0,1,2,... back into an integer permutation) */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/* Build (or refresh) the explicitly stored transpose of A on the GPU, used by
   MatMultTranspose and related operations. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  /* nothing to do if the cached transpose is already up to date */
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident SpMV scaling constants) */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose has A's dimensions swapped and the same number of nonzeros */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
#else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }
#endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* one-time setup: transpose the sequence 0,1,2,... to recover, for each entry of
         A^T, the index of the corresponding entry of A; later updates are then a cheap
         gather instead of a full csr2csc */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather A's values through the cached permutation to refresh A^T's values */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the explicitly transposed triangular factors, with row/column
   permutation applied on entry/exit (non-natural ordering path). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, placing the result in x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U (result goes into the work vector) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                              upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                              upTriFactorT->csrMat->values->data().get(),
                              upTriFactorT->csrMat->row_offsets->data().get(),
                              upTriFactorT->csrMat->column_indices->data().get(),
                              upTriFactorT->solveInfo,
                              xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              tempGPU->data().get(),
                              upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L (result back into x) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                              loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                              loTriFactorT->csrMat->values->data().get(),
                              loTriFactorT->csrMat->row_offsets->data().get(),
                              loTriFactorT->csrMat->column_indices->data().get(),
                              loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                              xarray,
                              loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                              xarray);PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b with the natural ordering: same two triangular solves as above but
   without the permutation shuffles. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */ 1463 if (!loTriFactorT && !upTriFactorT) { 1464 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1465 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1466 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1467 } 1468 1469 /* Get the GPU pointers */ 1470 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1471 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1472 1473 PetscCall(PetscLogGpuTimeBegin()); 1474 /* First, solve U */ 1475 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1476 upTriFactorT->csrMat->num_rows, 1477 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1478 upTriFactorT->csrMat->num_entries, 1479 #endif 1480 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1481 upTriFactorT->csrMat->values->data().get(), 1482 upTriFactorT->csrMat->row_offsets->data().get(), 1483 upTriFactorT->csrMat->column_indices->data().get(), 1484 upTriFactorT->solveInfo, 1485 barray, 1486 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1487 tempGPU->data().get(), 1488 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1489 #else 1490 tempGPU->data().get());PetscCallCUSPARSE(stat); 1491 #endif 1492 1493 /* Then, solve L */ 1494 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1495 loTriFactorT->csrMat->num_rows, 1496 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1497 loTriFactorT->csrMat->num_entries, 1498 #endif 1499 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1500 loTriFactorT->csrMat->values->data().get(), 1501 loTriFactorT->csrMat->row_offsets->data().get(), 1502 loTriFactorT->csrMat->column_indices->data().get(), 1503 loTriFactorT->solveInfo, 1504 tempGPU->data().get(), 1505 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1506 xarray, 1507 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1508 #else 1509 xarray);PetscCallCUSPARSE(stat); 1510 #endif 1511 1512 /* restore */ 1513 
PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1514 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1515 PetscCall(PetscLogGpuTimeEnd()); 1516 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1517 PetscFunctionReturn(0); 1518 } 1519 1520 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1521 { 1522 const PetscScalar *barray; 1523 PetscScalar *xarray; 1524 thrust::device_ptr<const PetscScalar> bGPU; 1525 thrust::device_ptr<PetscScalar> xGPU; 1526 cusparseStatus_t stat; 1527 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1528 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1529 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1530 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1531 1532 PetscFunctionBegin; 1533 1534 /* Get the GPU pointers */ 1535 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1536 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1537 xGPU = thrust::device_pointer_cast(xarray); 1538 bGPU = thrust::device_pointer_cast(barray); 1539 1540 PetscCall(PetscLogGpuTimeBegin()); 1541 /* First, reorder with the row permutation */ 1542 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1543 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1544 tempGPU->begin()); 1545 1546 /* Next, solve L */ 1547 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1548 loTriFactor->csrMat->num_rows, 1549 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1550 loTriFactor->csrMat->num_entries, 1551 #endif 1552 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1553 loTriFactor->csrMat->values->data().get(), 1554 loTriFactor->csrMat->row_offsets->data().get(), 1555 loTriFactor->csrMat->column_indices->data().get(), 
1556 loTriFactor->solveInfo, 1557 tempGPU->data().get(), 1558 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1559 xarray, 1560 loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1561 #else 1562 xarray);PetscCallCUSPARSE(stat); 1563 #endif 1564 1565 /* Then, solve U */ 1566 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1567 upTriFactor->csrMat->num_rows, 1568 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1569 upTriFactor->csrMat->num_entries, 1570 #endif 1571 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1572 upTriFactor->csrMat->values->data().get(), 1573 upTriFactor->csrMat->row_offsets->data().get(), 1574 upTriFactor->csrMat->column_indices->data().get(), 1575 upTriFactor->solveInfo,xarray, 1576 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1577 tempGPU->data().get(), 1578 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1579 #else 1580 tempGPU->data().get());PetscCallCUSPARSE(stat); 1581 #endif 1582 1583 /* Last, reorder with the column permutation */ 1584 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1585 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1586 xGPU); 1587 1588 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1589 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1590 PetscCall(PetscLogGpuTimeEnd()); 1591 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1592 PetscFunctionReturn(0); 1593 } 1594 1595 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1596 { 1597 const PetscScalar *barray; 1598 PetscScalar *xarray; 1599 cusparseStatus_t stat; 1600 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1601 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1602 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1603 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1604 1605 PetscFunctionBegin; 1606 /* Get the GPU pointers */ 1607 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1608 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1609 1610 PetscCall(PetscLogGpuTimeBegin()); 1611 /* First, solve L */ 1612 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1613 loTriFactor->csrMat->num_rows, 1614 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1615 loTriFactor->csrMat->num_entries, 1616 #endif 1617 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1618 loTriFactor->csrMat->values->data().get(), 1619 loTriFactor->csrMat->row_offsets->data().get(), 1620 loTriFactor->csrMat->column_indices->data().get(), 1621 loTriFactor->solveInfo, 1622 barray, 1623 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1624 tempGPU->data().get(), 1625 loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1626 #else 1627 tempGPU->data().get());PetscCallCUSPARSE(stat); 1628 #endif 1629 1630 /* Next, solve U */ 1631 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1632 upTriFactor->csrMat->num_rows, 1633 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634 upTriFactor->csrMat->num_entries, 1635 #endif 1636 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1637 upTriFactor->csrMat->values->data().get(), 1638 upTriFactor->csrMat->row_offsets->data().get(), 1639 upTriFactor->csrMat->column_indices->data().get(), 1640 upTriFactor->solveInfo, 1641 tempGPU->data().get(), 1642 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1643 xarray, 1644 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1645 #else 1646 xarray);PetscCallCUSPARSE(stat); 1647 #endif 1648 1649 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1650 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1651 PetscCall(PetscLogGpuTimeEnd()); 1652 
PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1653 PetscFunctionReturn(0); 1654 } 1655 1656 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1657 { 1658 PetscFunctionBegin; 1659 *type = MATSOLVERCUSPARSE; 1660 PetscFunctionReturn(0); 1661 } 1662 1663 /*MC 1664 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 1665 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 1666 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 1667 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 1668 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 1669 algorithms are not recommended. This class does NOT support direct solver operations. 1670 1671 Level: beginner 1672 1673 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 1674 M*/ 1675 1676 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1677 { 1678 PetscInt n = A->rmap->n; 1679 1680 PetscFunctionBegin; 1681 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1682 PetscCall(MatSetSizes(*B,n,n,n,n)); 1683 (*B)->factortype = ftype; 1684 PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 1685 1686 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 1687 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 1688 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1689 if (!A->boundtocpu) { 1690 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1691 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1692 } else { 1693 (*B)->ops->ilufactorsymbolic = 
MatILUFactorSymbolic_SeqAIJ; 1694 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 1695 } 1696 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1697 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 1698 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 1699 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1700 if (!A->boundtocpu) { 1701 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 1702 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1703 } else { 1704 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1705 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1706 } 1707 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 1708 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 1709 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 1710 1711 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1712 (*B)->canuseordering = PETSC_TRUE; 1713 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 1714 PetscFunctionReturn(0); 1715 } 1716 1717 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1718 { 1719 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1720 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1721 1722 PetscFunctionBegin; 1723 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1724 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1725 1726 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 1727 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 1728 PetscCallCUDA(WaitForCUDA()); 1729 
PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 1730 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 1731 A->offloadmask = PETSC_OFFLOAD_BOTH; 1732 } 1733 PetscFunctionReturn(0); 1734 } 1735 1736 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1737 { 1738 PetscFunctionBegin; 1739 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1740 *array = ((Mat_SeqAIJ*)A->data)->a; 1741 PetscFunctionReturn(0); 1742 } 1743 1744 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1745 { 1746 PetscFunctionBegin; 1747 A->offloadmask = PETSC_OFFLOAD_CPU; 1748 *array = NULL; 1749 PetscFunctionReturn(0); 1750 } 1751 1752 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1753 { 1754 PetscFunctionBegin; 1755 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1756 *array = ((Mat_SeqAIJ*)A->data)->a; 1757 PetscFunctionReturn(0); 1758 } 1759 1760 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1761 { 1762 PetscFunctionBegin; 1763 *array = NULL; 1764 PetscFunctionReturn(0); 1765 } 1766 1767 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1768 { 1769 PetscFunctionBegin; 1770 *array = ((Mat_SeqAIJ*)A->data)->a; 1771 PetscFunctionReturn(0); 1772 } 1773 1774 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1775 { 1776 PetscFunctionBegin; 1777 A->offloadmask = PETSC_OFFLOAD_CPU; 1778 *array = NULL; 1779 PetscFunctionReturn(0); 1780 } 1781 1782 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 1783 { 1784 Mat_SeqAIJCUSPARSE *cusp; 1785 CsrMatrix *matrix; 1786 1787 PetscFunctionBegin; 1788 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1789 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored 
matrix"); 1790 cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 1791 PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 1792 matrix = (CsrMatrix*)cusp->mat->mat; 1793 1794 if (i) { 1795 #if !defined(PETSC_USE_64BIT_INDICES) 1796 *i = matrix->row_offsets->data().get(); 1797 #else 1798 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 1799 #endif 1800 } 1801 if (j) { 1802 #if !defined(PETSC_USE_64BIT_INDICES) 1803 *j = matrix->column_indices->data().get(); 1804 #else 1805 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 1806 #endif 1807 } 1808 if (a) *a = matrix->values->data().get(); 1809 if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 1810 PetscFunctionReturn(0); 1811 } 1812 1813 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1814 { 1815 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1816 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1817 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1818 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1819 cusparseStatus_t stat; 1820 PetscBool both = PETSC_TRUE; 1821 1822 PetscFunctionBegin; 1823 PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1824 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1825 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1826 CsrMatrix *matrix; 1827 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1828 1829 PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1830 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1831 matrix->values->assign(a->a, a->a+a->nz); 1832 PetscCallCUDA(WaitForCUDA()); 1833 PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 1834 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1835 
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 1836 } else { 1837 PetscInt nnz; 1838 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1839 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 1840 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1841 delete cusparsestruct->workVector; 1842 delete cusparsestruct->rowoffsets_gpu; 1843 cusparsestruct->workVector = NULL; 1844 cusparsestruct->rowoffsets_gpu = NULL; 1845 try { 1846 if (a->compressedrow.use) { 1847 m = a->compressedrow.nrows; 1848 ii = a->compressedrow.i; 1849 ridx = a->compressedrow.rindex; 1850 } else { 1851 m = A->rmap->n; 1852 ii = a->i; 1853 ridx = NULL; 1854 } 1855 PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1856 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1857 else nnz = a->nz; 1858 PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1859 1860 /* create cusparse matrix */ 1861 cusparsestruct->nrows = m; 1862 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1863 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 1864 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 1865 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1866 1867 PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 1868 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 1869 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 1870 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1871 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1872 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1873 
PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 1874 1875 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1876 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1877 /* set the matrix */ 1878 CsrMatrix *mat= new CsrMatrix; 1879 mat->num_rows = m; 1880 mat->num_cols = A->cmap->n; 1881 mat->num_entries = nnz; 1882 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1883 mat->row_offsets->assign(ii, ii + m+1); 1884 1885 mat->column_indices = new THRUSTINTARRAY32(nnz); 1886 mat->column_indices->assign(a->j, a->j+nnz); 1887 1888 mat->values = new THRUSTARRAY(nnz); 1889 if (a->a) mat->values->assign(a->a, a->a+nnz); 1890 1891 /* assign the pointer */ 1892 matstruct->mat = mat; 1893 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1894 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1895 stat = cusparseCreateCsr(&matstruct->matDescr, 1896 mat->num_rows, mat->num_cols, mat->num_entries, 1897 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1898 mat->values->data().get(), 1899 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1900 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 1901 } 1902 #endif 1903 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1904 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1905 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1906 #else 1907 CsrMatrix *mat= new CsrMatrix; 1908 mat->num_rows = m; 1909 mat->num_cols = A->cmap->n; 1910 mat->num_entries = nnz; 1911 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1912 mat->row_offsets->assign(ii, ii + m+1); 1913 1914 mat->column_indices = new THRUSTINTARRAY32(nnz); 1915 mat->column_indices->assign(a->j, a->j+nnz); 1916 1917 mat->values = new THRUSTARRAY(nnz); 1918 if (a->a) mat->values->assign(a->a, a->a+nnz); 1919 1920 cusparseHybMat_t 
hybMat; 1921 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1922 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1923 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1924 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1925 matstruct->descr, mat->values->data().get(), 1926 mat->row_offsets->data().get(), 1927 mat->column_indices->data().get(), 1928 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1929 /* assign the pointer */ 1930 matstruct->mat = hybMat; 1931 1932 if (mat) { 1933 if (mat->values) delete (THRUSTARRAY*)mat->values; 1934 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1935 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1936 delete (CsrMatrix*)mat; 1937 } 1938 #endif 1939 } 1940 1941 /* assign the compressed row indices */ 1942 if (a->compressedrow.use) { 1943 cusparsestruct->workVector = new THRUSTARRAY(m); 1944 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1945 matstruct->cprowIndices->assign(ridx,ridx+m); 1946 tmp = m; 1947 } else { 1948 cusparsestruct->workVector = NULL; 1949 matstruct->cprowIndices = NULL; 1950 tmp = 0; 1951 } 1952 PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 1953 1954 /* assign the pointer */ 1955 cusparsestruct->mat = matstruct; 1956 } catch(char *ex) { 1957 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1958 } 1959 PetscCallCUDA(WaitForCUDA()); 1960 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1961 cusparsestruct->nonzerostate = A->nonzerostate; 1962 } 1963 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1964 } 1965 PetscFunctionReturn(0); 1966 } 1967 1968 struct VecCUDAPlusEquals 1969 { 1970 template <typename Tuple> 1971 __host__ __device__ 1972 void operator()(Tuple t) 1973 { 1974 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1975 } 1976 }; 1977 1978 struct VecCUDAEquals 1979 { 1980 template 
<typename Tuple> 1981 __host__ __device__ 1982 void operator()(Tuple t) 1983 { 1984 thrust::get<1>(t) = thrust::get<0>(t); 1985 } 1986 }; 1987 1988 struct VecCUDAEqualsReverse 1989 { 1990 template <typename Tuple> 1991 __host__ __device__ 1992 void operator()(Tuple t) 1993 { 1994 thrust::get<0>(t) = thrust::get<1>(t); 1995 } 1996 }; 1997 1998 struct MatMatCusparse { 1999 PetscBool cisdense; 2000 PetscScalar *Bt; 2001 Mat X; 2002 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2003 PetscLogDouble flops; 2004 CsrMatrix *Bcsr; 2005 2006 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2007 cusparseSpMatDescr_t matSpBDescr; 2008 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2009 cusparseDnMatDescr_t matBDescr; 2010 cusparseDnMatDescr_t matCDescr; 2011 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2012 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2013 void *dBuffer4; 2014 void *dBuffer5; 2015 #endif 2016 size_t mmBufferSize; 2017 void *mmBuffer; 2018 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2019 cusparseSpGEMMDescr_t spgemmDesc; 2020 #endif 2021 }; 2022 2023 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2024 { 2025 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2026 2027 PetscFunctionBegin; 2028 PetscCallCUDA(cudaFree(mmdata->Bt)); 2029 delete mmdata->Bcsr; 2030 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2031 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2032 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2033 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2034 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2035 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2036 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2037 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2038 
#endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of the sparse(A) x dense(B) products (AB, AtB, ABt, PtAP, RARt): runs
   cusparseSpMM (CUDA >= 11) or cusparseXcsrmm on the GPU; PtAP/RARt finish with a dense
   multiply against the intermediate mmdata->X. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  /* pick op(A), the mult struct to use, and the result dimensions m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* write into the intermediate X; the dense product with B finishes the job below */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow the workspace when missing or too small (equivalent to the original two-clause test) */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B, so transpose B explicitly with cuBLAS */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ?
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  if (product->type == MATPRODUCT_RARt) {
    /* finish C = R*A*R^T: multiply the intermediate X = A*R^T by R on the GPU */
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    /* finish C = P^T*A*P: multiply the intermediate X = A*P by P^T on the GPU */
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  if (mmdata->cisdense) { /* C was MATSEQDENSE on entry to symbolic; convert the result back to the CPU type */
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) { /* B was converted to MATSEQDENSECUDA earlier; restore its original type */
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/*
  Symbolic phase for products C = op(A)*op(B) with A of type SeqAIJCUSPARSE and B dense (CUDA).

  Determines the result dimensions from the product type, switches C to MATSEQDENSECUDA
  (remembering whether it must be converted back to MATSEQDENSE after the numeric phase),
  allocates the MatMatCusparse product data (including the B^T work buffer needed by the
  pre-CUDA-11 cusparseXcsrmm, and the intermediate dense matrix X for RARt/PtAP), and
  registers the numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    /* C = P^T*A*P is square with dimension = number of columns of B */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    /* C = R*A*R^T is square with dimension = number of rows of B */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  /*
    Numeric phase of the sparse-sparse product C = op(A)*op(B) for SeqAIJCUSPARSE matrices.
    Recomputes the values of C on the GPU using the descriptors/buffers created by the
    symbolic phase (stored in the MatMatCusparse product data). Three backends are selected
    at compile time: cusparseSpGEMMreuse (CUDA >= 11.4), cusparseSpGEMM (CUDA >= 11.0),
    or the legacy cusparse_csr_spgemm (CUDA < 11.0).
  */
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: skip the GPU product, only do the assembly bookkeeping below */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* for symmetric matrices, symbolic may have rewritten AtB/ABt as plain AB; mirror that here */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored transpose structs, not through opA/opB */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* structure of C is fixed from symbolic; only the values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/*
  Symbolic phase of the sparse-sparse product C = op(A)*op(B) for SeqAIJCUSPARSE matrices:
  computes the nonzero structure of C on the GPU, allocates C's CSR storage and host
  mirrors, and stores the cuSPARSE descriptors/buffers needed by the numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2462 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2463 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2464 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2465 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2466 2467 ptype = product->type; 2468 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2469 ptype = MATPRODUCT_AB; 2470 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2471 } 2472 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2473 ptype = MATPRODUCT_AB; 2474 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2475 } 2476 biscompressed = PETSC_FALSE; 2477 ciscompressed = PETSC_FALSE; 2478 switch (ptype) { 2479 case MATPRODUCT_AB: 2480 m = A->rmap->n; 2481 n = B->cmap->n; 2482 k = A->cmap->n; 2483 Amat = Acusp->mat; 2484 Bmat = Bcusp->mat; 2485 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2486 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2487 break; 2488 case MATPRODUCT_AtB: 2489 m = A->cmap->n; 2490 n = B->cmap->n; 2491 k = A->rmap->n; 2492 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2493 Amat = Acusp->matTranspose; 2494 Bmat = Bcusp->mat; 2495 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2496 break; 2497 case MATPRODUCT_ABt: 2498 m = A->rmap->n; 2499 n = B->rmap->n; 2500 k = A->cmap->n; 2501 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2502 Amat = Acusp->mat; 2503 Bmat = Bcusp->matTranspose; 2504 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2505 break; 2506 default: 2507 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2508 } 2509 2510 /* create cusparse matrix */ 2511 PetscCall(MatSetSizes(C,m,n,m,n)); 2512 PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 2513 c = 
(Mat_SeqAIJ*)C->data; 2514 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2515 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2516 Ccsr = new CsrMatrix; 2517 2518 c->compressedrow.use = ciscompressed; 2519 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2520 c->compressedrow.nrows = a->compressedrow.nrows; 2521 PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 2522 PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2523 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2524 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2525 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2526 } else { 2527 c->compressedrow.nrows = 0; 2528 c->compressedrow.i = NULL; 2529 c->compressedrow.rindex = NULL; 2530 Ccusp->workVector = NULL; 2531 Cmat->cprowIndices = NULL; 2532 } 2533 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2534 Ccusp->mat = Cmat; 2535 Ccusp->mat->mat = Ccsr; 2536 Ccsr->num_rows = Ccusp->nrows; 2537 Ccsr->num_cols = n; 2538 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2539 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2540 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2541 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2542 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 2543 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 2544 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2545 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2546 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2547 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2548 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2549 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2550 c->nz = 0; 2551 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2552 Ccsr->values = new THRUSTARRAY(c->nz); 2553 goto finalizesym; 2554 } 2555 2556 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2557 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2558 Acsr = (CsrMatrix*)Amat->mat; 2559 if (!biscompressed) { 2560 Bcsr = (CsrMatrix*)Bmat->mat; 2561 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2562 BmatSpDescr = Bmat->matDescr; 2563 #endif 2564 } else { /* we need to use row offsets for the full matrix */ 2565 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2566 Bcsr = new CsrMatrix; 2567 Bcsr->num_rows = B->rmap->n; 2568 Bcsr->num_cols = cBcsr->num_cols; 2569 Bcsr->num_entries = cBcsr->num_entries; 2570 Bcsr->column_indices = cBcsr->column_indices; 2571 Bcsr->values = cBcsr->values; 2572 if (!Bcusp->rowoffsets_gpu) { 2573 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2574 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2575 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2576 } 2577 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2578 mmdata->Bcsr = Bcsr; 2579 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2580 if (Bcsr->num_rows && Bcsr->num_cols) { 2581 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2582 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2583 Bcsr->values->data().get(), 2584 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2585 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2586 } 2587 BmatSpDescr = mmdata->matSpBDescr; 2588 #endif 2589 } 2590 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2591 
PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2592 /* precompute flops count */ 2593 if (ptype == MATPRODUCT_AB) { 2594 for (i=0, flops = 0; i<A->rmap->n; i++) { 2595 const PetscInt st = a->i[i]; 2596 const PetscInt en = a->i[i+1]; 2597 for (j=st; j<en; j++) { 2598 const PetscInt brow = a->j[j]; 2599 flops += 2.*(b->i[brow+1] - b->i[brow]); 2600 } 2601 } 2602 } else if (ptype == MATPRODUCT_AtB) { 2603 for (i=0, flops = 0; i<A->rmap->n; i++) { 2604 const PetscInt anzi = a->i[i+1] - a->i[i]; 2605 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2606 flops += (2.*anzi)*bnzi; 2607 } 2608 } else { /* TODO */ 2609 flops = 0.; 2610 } 2611 2612 mmdata->flops = flops; 2613 PetscCall(PetscLogGpuTimeBegin()); 2614 2615 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2616 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2617 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2618 NULL, NULL, NULL, 2619 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2620 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2621 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2622 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2623 { 2624 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2625 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2626 */ 2627 void* dBuffer1 = NULL; 2628 void* dBuffer2 = NULL; 2629 void* dBuffer3 = NULL; 2630 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2631 size_t bufferSize1 = 0; 2632 size_t bufferSize2 = 0; 2633 size_t bufferSize3 = 0; 2634 size_t bufferSize4 = 0; 2635 size_t bufferSize5 = 0; 2636 2637 /*----------------------------------------------------------------------*/ 2638 /* ask bufferSize1 bytes for external memory */ 2639 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2640 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2641 &bufferSize1, NULL);PetscCallCUSPARSE(stat); 2642 PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2643 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2644 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2645 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2646 &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 2647 2648 /*----------------------------------------------------------------------*/ 2649 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2650 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2651 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 2652 PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 2653 PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 2654 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2655 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2656 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2657 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 2658 
PetscCallCUDA(cudaFree(dBuffer1)); 2659 PetscCallCUDA(cudaFree(dBuffer2)); 2660 2661 /*----------------------------------------------------------------------*/ 2662 /* get matrix C non-zero entries C_nnz1 */ 2663 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2664 c->nz = (PetscInt) C_nnz1; 2665 /* allocate matrix C */ 2666 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2667 Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2668 /* update matC with the new pointers */ 2669 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2670 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2671 2672 /*----------------------------------------------------------------------*/ 2673 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2674 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2675 &bufferSize5, NULL);PetscCallCUSPARSE(stat); 2676 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2677 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2678 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2679 &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 2680 PetscCallCUDA(cudaFree(dBuffer3)); 2681 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2682 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2683 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2684 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2685 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 
2686 } 2687 #else 2688 size_t bufSize2; 2689 /* ask bufferSize bytes for external memory */ 2690 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2691 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2692 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2693 mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 2694 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2695 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2696 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2697 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2698 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2699 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 2700 /* ask bufferSize again bytes for external memory */ 2701 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2702 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2703 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2704 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 2705 /* The CUSPARSE documentation is not clear, nor the API 2706 We need both buffers to perform the operations properly! 2707 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2708 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2709 is stored in the descriptor! What a messy API... 
*/ 2710 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2711 /* compute the intermediate product of A * B */ 2712 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2713 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2714 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2715 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2716 /* get matrix C non-zero entries C_nnz1 */ 2717 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2718 c->nz = (PetscInt) C_nnz1; 2719 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2720 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2721 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2722 Ccsr->values = new THRUSTARRAY(c->nz); 2723 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2724 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2725 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2726 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2727 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2728 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2729 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2730 #else 2731 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2732 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2733 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2734 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2735 Bmat->descr, Bcsr->num_entries, 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2736 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 2737 c->nz = cnz; 2738 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2739 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2740 Ccsr->values = new THRUSTARRAY(c->nz); 2741 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2742 2743 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2744 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2745 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2746 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2747 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2748 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2749 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2750 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2751 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2752 #endif 2753 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2754 PetscCall(PetscLogGpuTimeEnd()); 2755 finalizesym: 2756 c->singlemalloc = PETSC_FALSE; 2757 c->free_a = PETSC_TRUE; 2758 c->free_ij = PETSC_TRUE; 2759 PetscCall(PetscMalloc1(m+1,&c->i)); 2760 PetscCall(PetscMalloc1(c->nz,&c->j)); 2761 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2762 PetscInt *d_i = c->i; 2763 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2764 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2765 ii = *Ccsr->row_offsets; 2766 jj = *Ccsr->column_indices; 
2767 if (ciscompressed) d_i = c->compressedrow.i; 2768 PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2769 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2770 } else { 2771 PetscInt *d_i = c->i; 2772 if (ciscompressed) d_i = c->compressedrow.i; 2773 PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2774 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2775 } 2776 if (ciscompressed) { /* need to expand host row offsets */ 2777 PetscInt r = 0; 2778 c->i[0] = 0; 2779 for (k = 0; k < c->compressedrow.nrows; k++) { 2780 const PetscInt next = c->compressedrow.rindex[k]; 2781 const PetscInt old = c->compressedrow.i[k]; 2782 for (; r < next; r++) c->i[r+1] = old; 2783 } 2784 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2785 } 2786 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 2787 PetscCall(PetscMalloc1(m,&c->ilen)); 2788 PetscCall(PetscMalloc1(m,&c->imax)); 2789 c->maxnz = c->nz; 2790 c->nonzerorowcnt = 0; 2791 c->rmax = 0; 2792 for (k = 0; k < m; k++) { 2793 const PetscInt nn = c->i[k+1] - c->i[k]; 2794 c->ilen[k] = c->imax[k] = nn; 2795 c->nonzerorowcnt += (PetscInt)!!nn; 2796 c->rmax = PetscMax(c->rmax,nn); 2797 } 2798 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 2799 PetscCall(PetscMalloc1(c->nz,&c->a)); 2800 Ccsr->num_entries = c->nz; 2801 2802 C->nonzerostate++; 2803 PetscCall(PetscLayoutSetUp(C->rmap)); 2804 PetscCall(PetscLayoutSetUp(C->cmap)); 2805 Ccusp->nonzerostate = C->nonzerostate; 2806 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2807 C->preallocated = PETSC_TRUE; 2808 C->assembled = PETSC_FALSE; 2809 C->was_assembled = PETSC_FALSE; 2810 if (product->api_user && A->offloadmask 
== PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask  = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
/*
   Selects the product-symbolic implementation for mat->product:
   - dense B             -> the SeqAIJCUSPARSE x SeqDENSECUDA kernel (or the CPU SeqAIJ x SeqDense path when A is bound to the CPU)
   - all-CUSPARSE        -> the SeqAIJCUSPARSE x SeqAIJCUSPARSE kernel (AB/AtB/ABt) or the generic ABC composition
   - otherwise           -> fall back to the SeqAIJ (CPU) setup
   Command line options (e.g. -matmatmult_backend_cpu) let the user force the CPU backend
   even when all operands live on the GPU.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* is B (base-)type dense? those products have a dedicated sparse-times-dense kernel */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* only consider the GPU backend when neither operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on which API entry point created the product (api_user)
       so users see an option matching the routine they actually called */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU backend */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composed products are expressed through the pairwise kernels */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^H xx */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): scatter-add the compressed work vector back into the
   full-length result. One thread per entry; launched with ceil(n/256) blocks of 256 below.
   NOTE(review): the += is not atomic, so entries of idx must be pairwise distinct (they are
   row indices of the nonzero rows) -- confirm before reusing this kernel elsewhere.
   NOTE(review): the loop index is int while n is PetscInt; fine for n < 2^31 but would
   truncate with 64-bit indices -- confirm n is bounded by local row count. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y as seen by cusparseSpMV; set only on the CSR path, which is the only path that uses them */
#endif

  PetscFunctionBegin;
  /* herm without trans is not a supported combination */
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A) x == 0, so z is just y (or zero) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose, applied non-transposed */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows; /* transposed: x has the length of A's rows */
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cuSpMV[] is indexed by opA; guard against an ABI change of the enum values */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy csrmv API */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose path: result is full length, only need to fold in yy when it is distinct from zz */
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the add, subtract one flop per nonzero row */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Finish assembly with the SeqAIJ routine, then drop the cached device mat if the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (onnz != A->nonzerostate &&
cusp->deviceMat) {
    /* the device copy was built for the old nonzero pattern; free it so it is rebuilt on demand */
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

/* Free GPU storage (plain or factored form), clear composed methods, then run the SeqAIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  /* composing with NULL removes the function from the object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the SeqAIJ routine, then convert the copy in place back to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU.
   Paths: identical nonzero pattern -> single cublas axpy on the value arrays;
   SUBSET_NONZERO_PATTERN -> cusparse spgeam (CSR only);
   otherwise -> fall back to the CPU MatAXPY_SeqAIJ (also when X does not use this axpy, i.e. is not on the GPU). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed backends: Y's values change on the CPU, so drop the cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column index arrays */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* Y's own coefficient in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* alpha/beta (&a, &b) are host pointers here, so switch the handle's pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11: query workspace size first (csrgeam2 protocol), then run with the buffer */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the pointer mode used elsewhere in this file */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical pattern: the value arrays align entry-for-entry, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different pattern: fall back to CPU; pattern (and values) will change */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, done as a flat cublas scal over the nonzero value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t
cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero the matrix values: GPU CSR values of the mat (and cached transpose) when present, and the host array.
   offloadmask is set to BOTH only when the GPU copy was actually zeroed too. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch A's operation tables between the CPU (SeqAIJ) and GPU (CUSPARSE) implementations.
   flg = TRUE: copy data back to the host, install CPU ops and remove the GPU composed methods;
   flg = FALSE: install the CUSPARSE ops and compose the GPU methods.
   Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* reset the SeqAIJ backend ops (array accessors etc.) to their defaults */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are only used by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: set the CUDA vector type, allocate the
   cuSPARSE context (plain or tri-factor form) on first conversion, install the
   CUSPARSE ops via MatBindToCPU(...,PETSC_FALSE), and compose type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* Type constructor: build a SeqAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
PetscCall(MatCreate_SeqAIJ(B)); 3556 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 3557 PetscFunctionReturn(0); 3558 } 3559 3560 /*MC 3561 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3562 3563 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3564 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3565 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3566 3567 Options Database Keys: 3568 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3569 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3570 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3571 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3572 3573 Level: beginner 3574 3575 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3576 M*/ 3577 3578 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3579 3580 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3581 { 3582 PetscFunctionBegin; 3583 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 3584 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 3585 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 3586 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 3587 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 3588 3589 PetscFunctionReturn(0); 3590 } 3591 3592 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3593 { 3594 Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3595 3596 PetscFunctionBegin; 3597 if (!cusp) PetscFunctionReturn(0); 3598 delete cusp->cooPerm; 3599 delete cusp->cooPerm_a; 3600 cusp->cooPerm = NULL; 3601 cusp->cooPerm_a = NULL; 3602 if (cusp->use_extended_coo) { 3603 PetscCallCUDA(cudaFree(cusp->jmap_d)); 3604 PetscCallCUDA(cudaFree(cusp->perm_d)); 3605 } 3606 cusp->use_extended_coo = PETSC_FALSE; 3607 PetscFunctionReturn(0); 3608 } 3609 3610 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3611 { 3612 PetscFunctionBegin; 3613 if (*cusparsestruct) { 3614 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 3615 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 3616 delete (*cusparsestruct)->workVector; 3617 delete (*cusparsestruct)->rowoffsets_gpu; 3618 delete (*cusparsestruct)->cooPerm; 3619 delete (*cusparsestruct)->cooPerm_a; 3620 delete (*cusparsestruct)->csr2csc_i; 3621 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3622 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3623 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3624 PetscCall(PetscFree(*cusparsestruct)); 3625 } 3626 PetscFunctionReturn(0); 3627 } 3628 3629 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3630 { 3631 PetscFunctionBegin; 3632 if (*mat) { 3633 delete (*mat)->values; 3634 delete (*mat)->column_indices; 3635 delete (*mat)->row_offsets; 3636 delete *mat; 3637 *mat = 0; 3638 } 3639 PetscFunctionReturn(0); 3640 } 3641 3642 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3643 { 3644 PetscFunctionBegin; 3645 if (*trifactor) { 3646 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3647 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 3648 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3649 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 3650 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3651 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3652 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3653 #endif 3654 PetscCall(PetscFree(*trifactor)); 3655 } 3656 PetscFunctionReturn(0); 3657 } 3658 3659 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3660 { 3661 CsrMatrix *mat; 3662 3663 PetscFunctionBegin; 3664 
if (*matstruct) { 3665 if ((*matstruct)->mat) { 3666 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3667 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3668 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3669 #else 3670 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3671 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3672 #endif 3673 } else { 3674 mat = (CsrMatrix*)(*matstruct)->mat; 3675 CsrMatrix_Destroy(&mat); 3676 } 3677 } 3678 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3679 delete (*matstruct)->cprowIndices; 3680 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 3681 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 3682 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3683 3684 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3685 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3686 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3687 for (int i=0; i<3; i++) { 3688 if (mdata->cuSpMV[i].initialized) { 3689 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3690 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3691 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3692 } 3693 } 3694 #endif 3695 delete *matstruct; 3696 *matstruct = NULL; 3697 } 3698 PetscFunctionReturn(0); 3699 } 3700 3701 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3702 { 3703 PetscFunctionBegin; 3704 if (*trifactors) { 3705 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 3706 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 3707 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 3708 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 3709 delete (*trifactors)->rpermIndices; 3710 delete (*trifactors)->cpermIndices; 3711 delete (*trifactors)->workVector; 3712 (*trifactors)->rpermIndices = NULL; 3713 (*trifactors)->cpermIndices = NULL; 3714 (*trifactors)->workVector = NULL; 3715 if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 3716 if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3717 (*trifactors)->init_dev_prop = PETSC_FALSE; 3718 } 3719 PetscFunctionReturn(0); 3720 } 3721 3722 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3723 { 3724 cusparseHandle_t handle; 3725 3726 PetscFunctionBegin; 3727 if (*trifactors) { 3728 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3729 if (handle = (*trifactors)->handle) { 3730 PetscCallCUSPARSE(cusparseDestroy(handle)); 3731 } 3732 PetscCall(PetscFree(*trifactors)); 3733 } 3734 PetscFunctionReturn(0); 3735 } 3736 3737 struct IJCompare 3738 { 3739 __host__ __device__ 3740 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3741 { 3742 if (t1.get<0>() < t2.get<0>()) return true; 3743 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3744 return false; 3745 } 3746 }; 3747 3748 struct IJEqual 3749 { 3750 __host__ __device__ 3751 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3752 { 3753 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3754 return true; 3755 } 3756 }; 3757 3758 struct IJDiff 3759 { 3760 __host__ __device__ 3761 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3762 { 3763 return t1 == t2 ? 
0 : 1;
  }
};

/* Logical OR of two flags (result stays 0/1); combines the row/column "changed" flags. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(): scatters (and
   optionally reduces) the user-provided COO values v[] into the device CSR
   values using the permutation computed at preallocation time. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix *matrix;
  PetscInt n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; just finish assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: INSERT zeroes the matrix, ADD leaves it untouched */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them in a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Marks the cached transpose as stale; with destroy == PETSC_TRUE the transpose
   mult struct and csr2csc index map are freed outright. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard any previously computed permutation */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i      = [1,1,3,3,4,4]
      d_j      = [2,2,2,3,5,6]
      cooPerm  = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i      = [1,3,3,4,4,x]
                            ^ekey
      d_j      = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());                 /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());     /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace host CSR arrays of the SeqAIJ part with the pattern just computed */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    /* empty COO pattern: fall back to an empty preallocation */
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: chooses between the 'basic' device path (no
   negative indices in coo_i/coo_j, or indices already on device) and the
   extended host path, which mirrors the SeqAIJ jmap/perm arrays to the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool coo_basic = PETSC_TRUE;
  PetscMemType mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (= ignored entries) require the extended path */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host-side COO maps (jmap: nonzero -> range of COO entries,
       perm: COO entry permutation) onto the device for MatAddCOOValues */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

/* Grid-stride kernel: for each nonzero i, sums the COO input values mapped to
   it (kv[perm[jmap[i]..jmap[i+1])]) and inserts/adds into the CSR values a[]. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}

/* COO value insertion entry point: extended path launches MatAddCOOValues on
   device data (staging host v[] if needed); otherwise defers to the basic path. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount Annz = seq->nz;
  PetscMemType memtype;
  const PetscScalar *v1 = v;
  PetscScalar *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* surface launch-configuration errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily upload the uncompressed row offsets */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: make sure the device copy is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: device copy must be current before handing it out */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;          /* caller may modify device values */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;          /* write-only: device becomes the valid copy */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row, column) ordering for 4-tuples (i, j, value, index); only i and j are compared. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to a column index; used to shift B's columns when merging. */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* NOTE(review): message typo, "number or rows" should read "number of rows" (also in the reuse branch below) */
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: symbolic + numeric phase, plus the permutation
       (cooPerm) that lets the MAT_REUSE_MATRIX path below replay only the numerics */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by cusparse SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* expand both operands to COO, merge them by (row,col), then convert back to CSR */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* CSR -> COO expansion of the row indices for A and B */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1); /* flag 1: entry came from A */
      auto Bperm = thrust::make_constant_iterator(0); /* flag 0: entry came from B */
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns on the fly so they land in the [A->cmap->n, n) block of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* stable (row,col) merge of the two sorted COO streams; wPerm records each entry's origin flag */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* undo the in-place shift of B's columns */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split destination positions by origin: cooPerm[0..Annz) = where A's values went, [Annz..nz) = B's */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* COO -> CSR compression of the merged row indices */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] (vertical stacking): row offsets of B^T are shifted by a->nz and appended,
           column indices and values are simply concatenated */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A^T's closing offset with B^T's first (shifted) offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the merged structure on the host so C is a fully valid SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a)); /* host values left uninitialized; GPU copy is authoritative (offloadmask below) */
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure of C is unchanged, scatter the new values through cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through cooPerm[0..Annz), B's through cooPerm[Annz..nz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are just [A^T values; B^T values] concatenated */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* values only exist on the device at this point */
  PetscFunctionReturn(0);
}

/* gather v[k] = A_values[idx[k]] for k in [0,n); v may live on the host or the device */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does the destination live in device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* upload the index set */
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
thrust::for_each(zibit,zieit,VecCUDAEquals()); 4683 if (w) { 4684 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4685 } 4686 delete w; 4687 } else { 4688 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4689 } 4690 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4691 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4692 PetscFunctionReturn(0); 4693 } 4694