/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* Name table consumed by PetscOptionsEnum(): value names, enum type name, option prefix, NULL terminator */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Attach a (caller-owned) CUDA stream to the cuSPARSE handle held by A */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Install a new cuSPARSE handle on A (destroying any handle A previously held)
   and put the handle in device pointer mode */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cuSPARSE handle of A; no-op when A is not SEQAIJCUSPARSE */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
150 151 Level: beginner 152 153 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 154 M*/ 155 156 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 157 { 158 PetscErrorCode ierr; 159 PetscInt n = A->rmap->n; 160 161 PetscFunctionBegin; 162 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 163 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 164 (*B)->factortype = ftype; 165 (*B)->useordering = PETSC_TRUE; 166 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 167 168 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 169 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 170 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 171 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 172 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 175 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 176 177 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 178 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 179 PetscFunctionReturn(0); 180 } 181 182 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 183 { 184 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 185 186 PetscFunctionBegin; 187 switch (op) { 188 case MAT_CUSPARSE_MULT: 189 cusparsestruct->format = format; 190 break; 191 case MAT_CUSPARSE_ALL: 192 cusparsestruct->format = format; 193 break; 194 default: 195 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /*@ 201 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 202 operation. Only the MatMult operation can use different GPU storage formats 203 for MPIAIJCUSPARSE matrices. 204 Not Collective 205 206 Input Parameters: 207 + A - Matrix of type SEQAIJCUSPARSE 208 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 209 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 210 211 Output Parameter: 212 213 Level: intermediate 214 215 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 216 @*/ 217 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 218 { 219 PetscErrorCode ierr; 220 221 PetscFunctionBegin; 222 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 223 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 224 PetscFunctionReturn(0); 225 } 226 227 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 228 { 229 PetscErrorCode ierr; 230 231 PetscFunctionBegin; 232 switch (op) { 233 case MAT_FORM_EXPLICIT_TRANSPOSE: 234 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 235 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 236 A->form_explicit_transpose = flg; 237 break; 238 default: 239 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 240 break; 241 } 242 PetscFunctionReturn(0); 243 } 244 245 static 
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU on the host; the numeric phase is redirected to the CUSPARSE implementation */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU on the host; the numeric phase is redirected to the CUSPARSE implementation */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky on the host; the numeric phase is redirected to the CUSPARSE implementation */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the unit-diagonal lower triangular factor L on the GPU
   from the host ILU factorization stored in A, and run the cuSPARSE solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the upper triangular factor U on the GPU
   from the host ILU factorization stored in A, and run the cuSPARSE solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next,
figure out the number of nonzeros in the upper triangular matrix. */ 493 nzUpper = adiag[0]-adiag[n]; 494 if (!upTriFactor) { 495 PetscScalar *AAUp; 496 497 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 498 499 /* Allocate Space for the upper triangular matrix */ 500 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 501 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 502 503 /* Fill the upper triangular matrix */ 504 AiUp[0]=(PetscInt) 0; 505 AiUp[n]=nzUpper; 506 offset = nzUpper; 507 for (i=n-1; i>=0; i--) { 508 v = aa + adiag[i+1] + 1; 509 vi = aj + adiag[i+1] + 1; 510 511 /* number of elements NOT on the diagonal */ 512 nz = adiag[i] - adiag[i+1]-1; 513 514 /* decrement the offset */ 515 offset -= (nz+1); 516 517 /* first, set the diagonal elements */ 518 AjUp[offset] = (PetscInt) i; 519 AAUp[offset] = (MatScalar)1./v[nz]; 520 AiUp[i] = AiUp[i+1] - (nz+1); 521 522 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 523 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 524 } 525 526 /* allocate space for the triangular factor information */ 527 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 528 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 529 530 /* Create the matrix description */ 531 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 532 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 533 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 534 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 535 #else 536 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 537 #endif 538 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 539 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 
540 541 /* set the operation */ 542 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 543 544 /* set the matrix */ 545 upTriFactor->csrMat = new CsrMatrix; 546 upTriFactor->csrMat->num_rows = n; 547 upTriFactor->csrMat->num_cols = n; 548 upTriFactor->csrMat->num_entries = nzUpper; 549 550 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 551 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 552 553 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 554 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 555 556 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 557 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 558 559 /* Create the solve analysis information */ 560 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 561 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 562 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 563 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 564 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 565 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 566 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 567 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 568 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 569 #endif 570 571 /* perform the solve analysis */ 572 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 573 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 574 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 575 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 576 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 577 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 578 #endif 579 );CHKERRCUSPARSE(stat); 580 cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 581 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 582 583 /* assign the pointer */ 584 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 585 upTriFactor->AA_h = AAUp; 586 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 587 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 588 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 589 } else { 590 if (!upTriFactor->AA_h) { 591 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 592 } 593 /* Fill the upper triangular matrix */ 594 offset = nzUpper; 595 for (i=n-1; i>=0; i--) { 596 v = aa + adiag[i+1] + 1; 597 598 /* number of elements NOT on the diagonal */ 599 nz = adiag[i] - adiag[i+1]-1; 600 601 /* decrement the offset */ 602 offset -= (nz+1); 603 604 /* first, set the diagonal elements */ 605 upTriFactor->AA_h[offset] = 1./v[nz]; 606 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 607 } 608 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 609 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 610 } 611 } catch(char *ex) { 612 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 613 } 614 } 615 PetscFunctionReturn(0); 616 } 617 618 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 619 { 620 PetscErrorCode ierr; 621 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 622 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 623 IS isrow = a->row,iscol = a->icol; 624 PetscBool row_identity,col_identity; 625 PetscInt n = A->rmap->n; 626 627 PetscFunctionBegin; 628 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 629 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 630 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 631 632 if 
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation to the GPU only when it is not the identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: likewise, upload the column permutation only when needed */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Assembles the ICC(0) triangular-factor data used by the cuSPARSE solves.

   The upper factor U is built on the host in CSR form: in each row the
   inverted diagonal is stored first (unit-diagonal storage: AAUp[diag] = 1/d),
   followed by the negated off-diagonal entries.  The "lower" solve reuses
   exactly the same sparsity pattern (AiUp/AjUp) with values additionally
   scaled by 1/d (AALo) and is performed with CUSPARSE_OPERATION_TRANSPOSE,
   which is why no separate lower-triangular structure is stored and why the
   lo factor's fill mode is deliberately CUSPARSE_FILL_MODE_UPPER.

   On the first call (no factor structs yet) the CSR structure, cuSPARSE
   descriptors, and solve-analysis data are created and uploaded; on later
   calls only the numerical values are refreshed on the GPU.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and to
   Mat_SeqSBAIJ (for i/j/a); this assumes the ICC factor is stored with an
   SBAIJ-compatible layout -- confirm against the factor-creation path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;   /* host staging buffer for U values */
  PetscScalar                       *AALo;   /* host staging buffer for the rescaled "lower" values */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers so the later thrust uploads are fast */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: inverted diagonal is stored at the row start */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negate for U, additionally divide by diag for the lower-solve copy */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* intentionally UPPER: the lower solve is done as U^T (solveOp below is TRANSPOSE) */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: shares U's row_offsets/column_indices, only the values differ */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist: recompute only the numerical values (same loop as above, minus structure) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    /* NOTE(review): thrust typically throws std::exception-derived types; a char* catch may never match -- verify */
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Prepares the GPU-side solve data for an ICC factor: builds the triangular
   matrices, allocates the work vector used by the triangular solves, records
   the factor's nonzero count (both triangles: (a->nz-n)*2 + n), and, when the
   ordering is not the identity, uploads the permutation (rperm) and its
   inverse (cperm) to the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Numeric LU factorization for SeqAIJCUSPARSE: the factorization itself is
   done on the CPU by MatLUFactorNumeric_SeqAIJ (after syncing A back from the
   GPU); then solve routines are selected (natural-ordering fast path when both
   permutations are the identity) and the factors are copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric Cholesky factorization for SeqAIJCUSPARSE: CPU factorization via
   MatCholeskyFactorNumeric_SeqAIJ, solve-routine selection based on whether
   the permutation is the identity, then ICC factor upload to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds explicit CSC (i.e. transposed CSR) copies of the lower and upper
   triangular factors with cusparse_csr2csc and runs the cuSPARSE solve
   analysis on them, storing the results in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose of A->spptr.  The transposed descriptors flip the
   fill mode (UPPER<->LOWER) and use NON_TRANSPOSE solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;  /* transposing flips the fill mode */
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin(), which began the event a second time and never ended it */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin(), leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* unary functor: extracts the (real part of the) scalar as a PetscInt; used below to
   recover the csr2csc permutation indices encoded as scalar values */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/*
   Builds (or refreshes) the explicit transpose of the matrix on the GPU for
   MatMultTranspose-type operations; caches a csr2csc permutation so later
   updates are a simple gather of the values array.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do unless an explicit transpose was requested and the matrix is non-empty */
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* transpose already up to date */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot be updated in place; rebuild from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident constants used by the SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR structure for A^T: dimensions swapped, same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* run csr2csc once on the sequence 0..nnz-1 (encoded as scalars) so that the
         result is the CSR->CSC value permutation, which we cache in csr2csc_i */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* decode the permuted sequence back into integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
    }
    /* gather the current values through the cached permutation -- this is the cheap update path */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b using the explicitly transposed triangular factors
   (built lazily below via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* Lazily build the transpose-analyzed factor structs the first time a transpose solve is requested */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b[rperm[i]] into x, which is then used as the solve input */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for the transposed system U is applied before L): x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - same as MatSolveTranspose_SeqAIJCUSPARSE
   but for a natural (identity) ordering: no row/column permutation is applied, so b feeds the
   U solve directly and the L solve writes straight into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* Lazily build the transpose-analyzed factor structs on first use */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: b -> tempGPU (no permutation needed for natural ordering) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE - forward/backward triangular solve on the GPU (general ordering).

   Gathers b with the row permutation into the work vector, solves L then U with the
   (non-transposed) factor structs, and scatters the result with the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b[rperm[i]] into the work vector
     (permutation_iterator equality is driven by the index iterator, so begin()/end() of
     rpermIndices delimit the copy) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: gather tempGPU[cperm[i]] into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - triangular solve for a natural (identity)
   ordering: no permutations, b feeds the L solve directly and the U solve writes into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr =
WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyFromGPU - download the matrix values from the device copy into the
   host CSR arrays when the GPU holds the only up-to-date copy.

   Only the a->nz values are transferred (the CSR structure is assumed unchanged on the
   device); on success the offload mask becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJGetArray_SeqAIJCUSPARSE - return the host value array for write access.

   Synchronizes host values from the GPU first, then marks the CPU copy as the
   authoritative one (the caller may modify the returned array).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyToGPU - upload the host CSR matrix to the device.

   Two paths: if the nonzero pattern is unchanged (same nonzerostate) and the format is
   plain CSR, only the values are re-uploaded; otherwise the whole device structure is
   destroyed and rebuilt (see the body below).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED ||
A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: the cached transpose values are stale (structure is kept, hence PETSC_FALSE) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      /* Nonzero pattern (or format) changed: throw away the old device structures and rebuild */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* With compressed rows only the rows with nonzeros are represented; ridx maps them back */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* a->a may legitimately be absent (structure-only matrix); then offloadmask must not claim BOTH */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* Device-resident alpha/beta constants; required because pointer mode is set to DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices!
*/
            /* Generic SpMat descriptor used by the CUDA >= 11 SpMV/SpMM APIs */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Build a temporary CSR copy on the device, convert it to HYB/ELL, then free the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL: fixed-width partition; HYB: let cusparse choose the split automatically */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* both is PETSC_FALSE only when the host had no value array (structure-only upload) */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate, element 1 += element 0 of the tuple */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: assign, element 1 = element 0 of the tuple */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: reversed assign, element 0 = element 1 of the tuple */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to C->product->data for AIJCUSPARSE matmat products */
struct MatMatCusparse {
  PetscBool             cisdense;    /* C was MATSEQDENSE on entry; convert the result back after numeric */
  PetscScalar           *Bt;         /* device buffer holding B^T for pre-CUDA-11 csrmm (no op(B) support) */
  Mat                   X;           /* intermediate dense product for PtAP/RARt */
  PetscBool             reusesym;    /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda;   /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2;  /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/*
   MatDestroy_MatMatCusparse - destroy callback for the product data above.

   Frees all optionally-allocated device buffers and cusparse descriptors
   (cudaFree(NULL) is a documented no-op, so Bt needs no guard), the cached
   Bcsr copy, and the intermediate matrix X.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of sparse(A) x dense(B)
   products (AB, AtB, ABt, and the first leg of PtAP/RARt, which is finished with a
   dense-dense multiply at the end).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Pick the device matrix (A or its explicit transpose) and op(A), plus the output sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr =
MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* For PtAP/RARt the sparse product goes into the intermediate X, not into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* Descriptors are cached; an LDA change invalidates the corresponding dense-matrix descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* Grow the work buffer only when the required size exceeds what is already allocated */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /*
to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    /* Explicitly transpose B into the preallocated Bt buffer with cublas geam */
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* PtAP/RARt: finish with a dense-dense multiply of B (or B^T) against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* Convert C and B back to their original (CPU) types if we converted them on entry */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of sparse x dense
   products: sets C's sizes/type, allocates the MatMatCusparse product data (plus the
   Bt buffer pre-CUDA-11 and the intermediate X for PtAP/RARt), and installs the
   numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* Result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of sparse x sparse
   products (continues past this chunk; only the head is visible here).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2234 goto finalize; 2235 } 2236 if (!c->nz) goto finalize; 2237 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2238 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2239 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2240 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2241 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2242 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2243 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2244 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2245 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2246 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2247 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2248 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2249 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2250 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2251 2252 ptype = product->type; 2253 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2254 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2255 switch (ptype) { 2256 case MATPRODUCT_AB: 2257 Amat = Acusp->mat; 2258 Bmat = Bcusp->mat; 2259 break; 2260 case MATPRODUCT_AtB: 2261 Amat = Acusp->matTranspose; 2262 Bmat = Bcusp->mat; 2263 break; 2264 case MATPRODUCT_ABt: 2265 Amat = 
Acusp->mat; 2266 Bmat = Bcusp->matTranspose; 2267 break; 2268 default: 2269 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2270 } 2271 Cmat = Ccusp->mat; 2272 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2273 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2274 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2275 Acsr = (CsrMatrix*)Amat->mat; 2276 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2277 Ccsr = (CsrMatrix*)Cmat->mat; 2278 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2279 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2280 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2281 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2282 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2283 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2284 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2285 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2286 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2287 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2288 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2289 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2290 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2291 #else 2292 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2293 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2294 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2295 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2296 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2297 #endif 2298 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2299 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2300 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2301 C->offloadmask = PETSC_OFFLOAD_GPU; 2302 finalize: 2303 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2304 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2305 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2306 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2307 c->reallocs = 0; 2308 C->info.mallocs += 0; 2309 C->info.nz_unneeded = 0; 2310 C->assembled = C->was_assembled = 
PETSC_TRUE;
2311 C->num_ass++;
2312 PetscFunctionReturn(0);
2313 }
2314
/* Symbolic phase for sparse-sparse products (AB, AtB, ABt) with A and B SeqAIJCUSPARSE
   in CSR format. Builds the CSR structure of C on the GPU (cusparseSpGEMM on CUDA >= 11,
   legacy csrgemm otherwise), allocates all cuSPARSE descriptors/buffers reused by the
   numeric phase, and mirrors the resulting structure on the host so C is a valid SeqAIJ. */
2315 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2316 {
2317 Mat_Product *product = C->product;
2318 Mat A,B;
2319 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2320 Mat_SeqAIJ *a,*b,*c;
2321 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2322 CsrMatrix *Acsr,*Bcsr,*Ccsr;
2323 PetscInt i,j,m,n,k;
2324 PetscBool flg;
2325 PetscErrorCode ierr;
2326 cusparseStatus_t stat;
2327 cudaError_t cerr;
2328 MatProductType ptype;
2329 MatMatCusparse *mmdata;
2330 PetscLogDouble flops;
2331 PetscBool biscompressed,ciscompressed;
2332 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2333 int64_t C_num_rows1, C_num_cols1, C_nnz1;
2334 size_t bufSize2;
2335 cusparseSpMatDescr_t BmatSpDescr;
2336 #else
2337 int cnz;
2338 #endif
2339
2340 PetscFunctionBegin;
2341 MatCheckProduct(C,1);
2342 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2343 A = product->A;
2344 B = product->B;
2345 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2346 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2347 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2348 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2349 a = (Mat_SeqAIJ*)A->data;
2350 b = (Mat_SeqAIJ*)B->data;
2351 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2352 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2353 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2354 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2355
2356 /* product data */
2357 ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2358 C->product->data = mmdata;
2359 C->product->destroy = MatDestroy_MatMatCusparse;
2360
2361 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2362 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
/* for symmetric operands the transpose product can use the cheaper non-transposed form */
2363 ptype = product->type;
2364 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2365 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2366 biscompressed = PETSC_FALSE;
2367 ciscompressed = PETSC_FALSE;
/* pick operand mult structs and result sizes (m x n, inner dimension k) per product type;
   the explicit transpose is formed on demand for AtB/ABt */
2368 switch (ptype) {
2369 case MATPRODUCT_AB:
2370 m = A->rmap->n;
2371 n = B->cmap->n;
2372 k = A->cmap->n;
2373 Amat = Acusp->mat;
2374 Bmat = Bcusp->mat;
2375 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2376 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2377 break;
2378 case MATPRODUCT_AtB:
2379 m = A->cmap->n;
2380 n = B->cmap->n;
2381 k = A->rmap->n;
2382 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2383 Amat = Acusp->matTranspose;
2384 Bmat = Bcusp->mat;
2385 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2386 break;
2387 case MATPRODUCT_ABt:
2388 m = A->rmap->n;
2389 n = B->rmap->n;
2390 k = A->cmap->n;
2391 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2392 Amat = Acusp->mat;
2393 Bmat = Bcusp->matTranspose;
2394 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2395 break;
2396 default:
2397 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2398 }
2399
2400 /* create cusparse matrix */
2401 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2402 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2403 c = (Mat_SeqAIJ*)C->data;
2404 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2405 Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2406 Ccsr = new CsrMatrix;
2407
2408 c->compressedrow.use = ciscompressed;
2409 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
2410 c->compressedrow.nrows = a->compressedrow.nrows;
2411 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2412 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2413 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2414 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2415 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2416 } else {
2417 c->compressedrow.nrows = 0;
2418 c->compressedrow.i = NULL;
2419 c->compressedrow.rindex = NULL;
2420 Ccusp->workVector = NULL;
2421 Cmat->cprowIndices = NULL;
2422 }
/* number of rows actually stored on the GPU (compressed rows only, when compressed) */
2423 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2424 Ccusp->mat = Cmat;
2425 Ccusp->mat->mat = Ccsr;
2426 Ccsr->num_rows = Ccusp->nrows;
2427 Ccsr->num_cols = n;
2428 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2429 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2430 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2431 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
/* device-resident scalar constants, needed since the handle is used in device pointer mode below */
2432 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2433 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2434 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2435 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2436 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2437 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2438 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2439 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2440 c->nz = 0;
2441 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2442 Ccsr->values = new THRUSTARRAY(c->nz);
2443 goto finalizesym;
2444 }
2445
2446 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2447 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2448 Acsr = (CsrMatrix*)Amat->mat;
2449 if (!biscompressed) {
2450 Bcsr = (CsrMatrix*)Bmat->mat;
2451 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2452 BmatSpDescr = Bmat->matDescr;
2453 #endif
2454 } else { /* we need to use row offsets for the full matrix */
/* build a shallow CSR view of B with full (non-compressed) row offsets; indices/values are shared */
2455 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2456 Bcsr = new CsrMatrix;
2457 Bcsr->num_rows = B->rmap->n;
2458 Bcsr->num_cols = cBcsr->num_cols;
2459 Bcsr->num_entries = cBcsr->num_entries;
2460 Bcsr->column_indices = cBcsr->column_indices;
2461 Bcsr->values = cBcsr->values;
2462 if (!Bcusp->rowoffsets_gpu) {
2463 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2464 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2465 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2466 }
2467 Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2468 mmdata->Bcsr = Bcsr;
2469 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2470 if (Bcsr->num_rows && Bcsr->num_cols) {
2471 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2472 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2473 Bcsr->values->data().get(),
2474 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2475 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2476 }
2477 BmatSpDescr = mmdata->matSpBDescr;
2478 #endif
2479 }
2480 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2481 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2482 /* precompute flops count */
2483 if (ptype == MATPRODUCT_AB) {
2484 for (i=0, flops = 0; i<A->rmap->n; i++) {
2485 const PetscInt st = a->i[i];
2486 const PetscInt en = a->i[i+1];
2487 for (j=st; j<en; j++) {
2488 const PetscInt brow = a->j[j];
2489 flops += 2.*(b->i[brow+1] - b->i[brow]);
2490 }
2491 }
2492 } else if (ptype == MATPRODUCT_AtB) {
2493 for (i=0, flops = 0; i<A->rmap->n; i++) {
2494 const PetscInt anzi = a->i[i+1] - a->i[i];
2495 const PetscInt bnzi = b->i[i+1] - b->i[i];
2496 flops += (2.*anzi)*bnzi;
2497 }
2498 } else { /* TODO */
2499 flops = 0.;
2500 }
2501
2502 mmdata->flops = flops;
2503 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2504 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* SpGEMM workflow: workEstimation (twice: size query, then analysis) -> compute (twice:
   size query, then actual product) -> allocate C arrays -> copy structure+values into C */
2505 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2506 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2507 NULL, NULL, NULL,
2508 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2509 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2510 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2511 /* ask bufferSize bytes for external memory */
2512 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2513 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2514 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2515 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2516 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2517 /* inspect the matrices A and B to understand the memory requirement for the next step */
2518 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2519 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2520 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2521 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2522 /* ask bufferSize again bytes for external memory */
2523 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2524 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2525 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2526 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2527 /* The CUSPARSE documentation is not clear, nor the API
2528 We need both buffers to perform the operations properly!
2529 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2530 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2531 is stored in the descriptor! What a messy API... */
2532 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2533 /* compute the intermediate product of A * B */
2534 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2535 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2536 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2537 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2538 /* get matrix C non-zero entries C_nnz1 */
2539 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2540 c->nz = (PetscInt) C_nnz1;
2541 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2542 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2543 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2544 Ccsr->values = new THRUSTARRAY(c->nz);
2545 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2546 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2547 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2548 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2549 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2550 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2551 #else
/* legacy path: csrgemmNnz computes row offsets and nnz (host pointer mode), then the
   full gemm fills indices and values */
2552 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2553 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2554 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2555 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2556 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2557 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2558 c->nz = cnz;
2559 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2560 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2561 Ccsr->values = new THRUSTARRAY(c->nz);
2562 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2563
2564 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2565 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2566 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2567 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2568 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2570 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2571 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2572 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2573 #endif
2574 cerr = WaitForCUDA();CHKERRCUDA(cerr);
2575 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2576 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
/* mirror the GPU structure on the host so C behaves like a regular SeqAIJ matrix */
2577 finalizesym:
2578 c->singlemalloc = PETSC_FALSE;
2579 c->free_a = PETSC_TRUE;
2580 c->free_ij = PETSC_TRUE;
2581 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2582 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2583 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2584 PetscInt *d_i = c->i;
2585 THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2586 THRUSTINTARRAY jj(Ccsr->column_indices->size());
2587 ii = *Ccsr->row_offsets;
2588 jj = *Ccsr->column_indices;
2589 if (ciscompressed) d_i = c->compressedrow.i;
2590 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2591 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2592 } else {
2593 PetscInt *d_i = c->i;
2594 if (ciscompressed) d_i = c->compressedrow.i;
2595 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2596 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2597 }
2598 if (ciscompressed) { /* need to expand host row offsets */
2599 PetscInt r = 0;
2600 c->i[0] = 0;
2601 for (k = 0; k < c->compressedrow.nrows; k++) {
2602 const PetscInt next = c->compressedrow.rindex[k];
2603 const PetscInt old = c->compressedrow.i[k];
2604 for (; r < next; r++) c->i[r+1] = old;
2605 }
2606 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2607 }
2608 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2609 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2610 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2611 c->maxnz = c->nz;
2612 c->nonzerorowcnt = 0;
2613 c->rmax = 0;
/* fill per-row lengths and row statistics from the expanded row offsets */
2614 for (k = 0; k < m; k++) {
2615 const PetscInt nn = c->i[k+1] - c->i[k];
2616 c->ilen[k] = c->imax[k] = nn;
2617 c->nonzerorowcnt += (PetscInt)!!nn;
2618 c->rmax = PetscMax(c->rmax,nn);
2619 }
2620 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2621 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2622 Ccsr->num_entries = c->nz;
2623
2624 C->nonzerostate++;
2625 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2626 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2627 Ccusp->nonzerostate = C->nonzerostate;
2628 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2629 C->preallocated = PETSC_TRUE;
2630 C->assembled = PETSC_FALSE;
2631 C->was_assembled = PETSC_FALSE;
2632 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2633 mmdata->reusesym = PETSC_TRUE;
2634 C->offloadmask = PETSC_OFFLOAD_GPU;
2635 }
2636 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2637 PetscFunctionReturn(0);
2638 }
2639
2640 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2641
2642 /* handles sparse or dense B */
2643 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2644 {
2645 Mat_Product *product = mat->product;
2646
PetscErrorCode ierr;
2647 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2648
2649 PetscFunctionBegin;
2650 MatCheckProduct(mat,1);
/* dispatch based on the types of B (dense vs. CUSPARSE) and, for ABC, of C;
   CPU-bound operands force the CPU (SeqAIJ) fallbacks */
2651 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2652 if (!product->A->boundtocpu && !product->B->boundtocpu) {
2653 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2654 }
2655 if (product->type == MATPRODUCT_ABC) {
2656 Ciscusp = PETSC_FALSE;
2657 if (!product->C->boundtocpu) {
2658 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2659 }
2660 }
2661 if (isdense) {
2662 switch (product->type) {
2663 case MATPRODUCT_AB:
2664 case MATPRODUCT_AtB:
2665 case MATPRODUCT_ABt:
2666 case MATPRODUCT_PtAP:
2667 case MATPRODUCT_RARt:
2668 if (product->A->boundtocpu) {
2669 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2670 } else {
2671 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2672 }
2673 break;
2674 case MATPRODUCT_ABC:
2675 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2676 break;
2677 default:
2678 break;
2679 }
2680 } else if (Biscusp && Ciscusp) {
2681 switch (product->type) {
2682 case MATPRODUCT_AB:
2683 case MATPRODUCT_AtB:
2684 case MATPRODUCT_ABt:
2685 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2686 break;
2687 case MATPRODUCT_PtAP:
2688 case MATPRODUCT_RARt:
2689 case MATPRODUCT_ABC:
2690 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2691 break;
2692 default:
2693 break;
2694 }
2695 } else { /* fallback for AIJ */
2696 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2697 }
2698 PetscFunctionReturn(0);
2699 }
2700
/* yy = A xx */
2701 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2702 {
2703 PetscErrorCode ierr;
2704
2705 PetscFunctionBegin;
2706 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2707 PetscFunctionReturn(0);
2708 }
2709
/* zz = A xx + yy */
2710 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2711 {
2712 PetscErrorCode ierr;
2713
2714 PetscFunctionBegin;
2715 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2716 PetscFunctionReturn(0);
2717 }
2718
/* yy = A^H xx (conjugate transpose) */
2719 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2720 {
2721 PetscErrorCode ierr;
2722
2723 PetscFunctionBegin;
2724 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2725 PetscFunctionReturn(0);
2726 }
2727
/* zz = A^H xx + yy */
2728 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2729 {
2730 PetscErrorCode ierr;
2731
2732 PetscFunctionBegin;
2733 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2734 PetscFunctionReturn(0);
2735 }
2736
/* yy = A^T xx */
2737 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2738 {
2739 PetscErrorCode ierr;
2740
2741 PetscFunctionBegin;
2742 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2743 PetscFunctionReturn(0);
2744 }
2745
/* Scatter-add kernel: y[idx[i]] += x[i] for i in [0,n); one thread per entry,
   bounds-checked, launched with a 1D grid */
2746 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2747 {
2748 int i = blockIdx.x*blockDim.x + threadIdx.x;
2749 if (i < n) y[idx[i]] += x[i];
2750 }
2751
2752 /* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A)x; set only in the CSR path */
#endif

  PetscFunctionBegin;
  /* herm without trans is rejected: op = conj(A) alone is not implemented */
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: op(A) x == 0, so z is just 0 or a copy of y */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose implicitly on the plain 'mat' */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (building on demand) an explicitly stored transpose and a NON_TRANSPOSE op */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV[] cache below; guard against ABI drift */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    /* a device-side matrix exists, so the freshest data lives on the GPU */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.
   n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroys the GPU-side storage (or triangular factors), any split-CSR device
   matrix, the composed function slots, and finally the host SeqAIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* grab the device-matrix pointer before spptr is destroyed so it can be freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so its embedded device pointers can be freed */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicates via the host SeqAIJ path and converts the copy in place to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X, performed on the GPU when both operands are CUSPARSE matrices */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed bindings: fall back to the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr =
MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nonzero counts and identical
     row/column index arrays mean the patterns actually coincide */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* general sparse add via cusparse csrgeam: Y = a*X + 1*Y, written in place into Y */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays line up, so a flat cublas axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: fall back to the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, scaling the GPU values array in place with cublas */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cudaError_t    cerr;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zeros both the GPU copies (mat and, if present, its transpose) and the host values */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when GPU values were also zeroed */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); /* zero the host values array too */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switches the matrix operations between the host (SeqAIJ) and GPU (CUSPARSE)
   implementations; flg = PETSC_TRUE binds to the CPU. No-op for factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* bring data back to the host before routing operations there */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inodes are a host-side optimization; disable when on GPU */
  PetscFunctionReturn(0);
}

PETSC_INTERN
PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX this is A itself */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: create the cuSPARSE context that holds the CSR copies */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: context for the triangular factors instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU operations */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: builds a SeqAIJ matrix then converts it in place to CUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/* Registers the CUSPARSE solver package for LU/Cholesky/ILU/ICC factorizations of SEQAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Releases everything held by the Mat_SeqAIJCUSPARSE context: the GPU matrix and
   its transpose, device work/index arrays, and the cuSPARSE handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a CsrMatrix (values plus index arrays) and nulls the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Overload for a triangular-factor struct: frees the descriptor, analysis info,
   CSR storage, and the solve/csr2csc scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Overload for a mult struct: frees the format-specific matrix storage (HYB or CSR),
   descriptors, device scalar constants, and the cached SpMV descriptors/buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices; /* delete of NULL is a no-op */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 uses the generic SpMV API: also destroy the sparse-matrix descriptor and
       the three cached SpMV configurations (buffer plus dense-vector descriptors each) */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSETriFactors_Reset - destroys the four triangular factors (lower/upper,
   plain and transposed) and the permutation/work vectors, but keeps the container itself
   (and its cuSPARSE handle) alive for reuse.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  /* Full teardown of the triangular-factor container: reset all factors, destroy the
     cuSPARSE handle, then free the container itself. NULL-safe. */
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle = (*trifactors)->handle) { /* assignment intended, not comparison */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row,col) "less than" for COO index pairs; used to sort by row, then column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of COO (row,col) pairs; used to detect duplicate entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* Binary op for adjacent_difference: 0 where consecutive values are equal, 1 otherwise
   (body continues on the following lines) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
0 : 1;
  }
};

/* Logical-or of two flags; combines the row-change and column-change indicators */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - fills the CSR values of a matrix that was preallocated
   with MatSetPreallocationCOO_SeqAIJCUSPARSE(). v holds the values in the user's original
   COO order; they are routed into CSR order through the permutation cusp->cooPerm computed
   at preallocation time (and reduced through cusp->cooPerm_a when duplicates exist).
   v may live on host or device memory; imode is INSERT_VALUES or ADD_VALUES.
   (The function continues on the following lines.)
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values with INSERT_VALUES means "zero the matrix" */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them in a temporary device buffer first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* duplicate entries: reduce into a scratch array, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      /* accumulate the reduced values onto the existing CSR values */
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: add the permuted values directly onto the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: overwrite the CSR values with the permuted input */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached explicit transpose as stale;
   if destroy is true, the transpose mult structure and the csr2csc permutation are
   freed as well.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR structure of A from n COO entries
   (coo_i[], coo_j[]) on the GPU, and records the permutation cusp->cooPerm from the user's
   COO order to sorted CSR order (plus cusp->cooPerm_a, the reduce_by_key map used when
   duplicate entries are present) for later use by MatSetValuesCOO_SeqAIJCUSPARSE().
   (The function continues on the following lines.)
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed: throw away the previously cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);             /* device copy of the COO row indices */
    THRUSTINTARRAY d_j(n);             /* device copy of the COO column indices */
    THRUSTINTARRAY ii(A->rmap->n);     /* per-row entry counts (scan positions) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs lexicographically, carrying the original positions in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark positions where either the row or the column changes, then prefix-sum the
         marks: equal (i,j) pairs end up with the same key in cooPerm_a, which is what
         reduce_by_key needs at MatSetValuesCOO time */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row by locating each row's upper bound in the sorted row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr =
PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (a->i, a->j) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* fill the per-row lengths and gather row statistics */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* n == 0: empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* preallocated but not assembled: the values still need MatSetValuesCOO */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGetArrayRead - returns a read-only device pointer to the CSR values of A,
   copying them to the GPU first if needed. Pair with MatSeqAIJCUSPARSERestoreArrayRead().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Companion of MatSeqAIJCUSPARSEGetArrayRead(): read-only access, so no state bump needed */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGetArray - returns a read/write device pointer to the CSR values of A
   (the function continues on the following lines).
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values may be modified: GPU copy becomes authoritative */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Companion of MatSeqAIJCUSPARSEGetArray(): values may have changed, so bump the state */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGetArrayWrite - returns a write-only device pointer to the CSR values;
   unlike MatSeqAIJCUSPARSEGetArray() it does not copy the current values to the GPU first.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode
MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  /* Companion of MatSeqAIJCUSPARSEGetArrayWrite(): bumps the object state (values
     changed) and invalidates the caller's pointer. */
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Ordering for merged (row,col,val,flag) tuples: lexicographic by (row,col) only */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed shift to an index (used to offset B's column indices) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation
   (the function continues on the following lines) */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* fixed error-message typo: "number or rows" -> "number of rows" */
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: m rows, columns of A followed by columns of B */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the SpMV calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* expand both CSR matrices to COO, merge the entries row-by-row, then compress back */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift applied above for old CUDA versions */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce =
thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged positions back into "came from A" (flag 1) and "came from B"
         (flag 0) halves of cooPerm; equivalent to the partition_copy disabled above */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' is A' stacked on top of B': concatenate rows (B's offsets shifted by a->nz),
           column indices, and values of the two transposes */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overlap the duplicated boundary offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the structure to the host CSR arrays of C */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() +
Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4107 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4108 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4109 c->maxnz = c->nz; 4110 c->nonzerorowcnt = 0; 4111 c->rmax = 0; 4112 for (i = 0; i < m; i++) { 4113 const PetscInt nn = c->i[i+1] - c->i[i]; 4114 c->ilen[i] = c->imax[i] = nn; 4115 c->nonzerorowcnt += (PetscInt)!!nn; 4116 c->rmax = PetscMax(c->rmax,nn); 4117 } 4118 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4119 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4120 (*C)->nonzerostate++; 4121 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4122 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4123 Ccusp->nonzerostate = (*C)->nonzerostate; 4124 (*C)->preallocated = PETSC_TRUE; 4125 } else { 4126 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4127 c = (Mat_SeqAIJ*)(*C)->data; 4128 if (c->nz) { 4129 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4130 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4131 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4132 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4133 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4134 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4135 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4136 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4137 Acsr = (CsrMatrix*)Acusp->mat->mat; 4138 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4139 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4140 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4141 if (Bcsr->num_entries != 
(PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4142 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4143 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4144 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4145 auto pmid = Ccusp->cooPerm->begin(); 4146 thrust::advance(pmid,Acsr->num_entries); 4147 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4148 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4149 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4150 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4151 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4152 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4153 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4154 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4155 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4156 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4157 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4158 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4159 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4160 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4161 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* C^T values are A^T values followed by B^T values; append whichever transposes are available */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE - gather a subset of the matrix value
   array directly on the GPU.

   Copies v[k] = a[idx[k]] for k = 0,..,n-1, where a is the device copy of the
   AIJ value array of A.  The destination buffer v may live in host or device
   memory; isCudaMem() detects which, and the copy path is chosen accordingly.
   When idx is NULL (or n is 0) the first n values are copied verbatim instead.

   Input Parameters:
+  A   - the SEQAIJCUSPARSE matrix
.  n   - number of entries to copy
-  idx - host array of indices into the value array, or NULL for 0,..,n-1

   Output Parameter:
.  v   - destination buffer (host or device memory), length at least n
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the (host) index array to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device staging buffer, copy to host below */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = dav[widx[k]] via a permutation-iterator gather executed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* contiguous copy of the first n values; direction depends on where v lives */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the transfer above moved data device->host, so log GpuToCpu
     (the original logged CpuToGpu, which records the transfer in the wrong direction) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}