1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library. 4 CUSPARSE changed completely with CUDA 11, so this functionality requires an older CUDA. 5 */ 6 #define PETSC_SKIP_SPINLOCK 7 #define PETSC_SKIP_CXX_COMPLEX_FIX 8 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 9 10 #include <petscconf.h> 11 #include <petscpkg_version.h> 12 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 13 #include <../src/mat/impls/sbaij/seq/sbaij.h> 14 #include <../src/vec/vec/impls/dvecimpl.h> 15 #include <petsc/private/vecimpl.h> 16 #undef VecType 17 18 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 19 20 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 21 22 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23 24 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 25 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 26 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 27 28 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 29 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 30 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 31 32 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 34 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 35 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 36 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 37 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 38 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 39 static PetscErrorCode 
MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 40 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 41 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 42 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 43 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 44 45 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 46 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 47 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 48 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 49 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 50 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 51 52 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 53 { 54 cusparseStatus_t stat; 55 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 56 57 PetscFunctionBegin; 58 cusparsestruct->stream = stream; 59 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 60 PetscFunctionReturn(0); 61 } 62 63 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 64 { 65 cusparseStatus_t stat; 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 68 PetscFunctionBegin; 69 if (cusparsestruct->handle != handle) { 70 if (cusparsestruct->handle) { 71 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 72 } 73 cusparsestruct->handle = handle; 74 } 75 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 76 PetscFunctionReturn(0); 77 } 78 79 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 80 { 81 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 82 83 PetscFunctionBegin; 84 if 
(cusparsestruct->handle) cusparsestruct->handle = 0; 85 PetscFunctionReturn(0); 86 } 87 88 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 89 { 90 PetscFunctionBegin; 91 *type = MATSOLVERCUSPARSE; 92 PetscFunctionReturn(0); 93 } 94 95 /*MC 96 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 97 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 98 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 99 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 100 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 101 algorithms are not recommended. This class does NOT support direct solver operations. 102 103 Level: beginner 104 105 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 106 M*/ 107 108 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 109 { 110 PetscErrorCode ierr; 111 PetscInt n = A->rmap->n; 112 113 PetscFunctionBegin; 114 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 115 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 116 (*B)->factortype = ftype; 117 (*B)->useordering = PETSC_TRUE; 118 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 119 120 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 121 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 122 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 123 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 124 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 125 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 126 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 127 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 128 129 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 130 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 131 PetscFunctionReturn(0); 132 } 133 134 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 135 { 136 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 137 138 PetscFunctionBegin; 139 switch (op) { 140 case MAT_CUSPARSE_MULT: 141 cusparsestruct->format = format; 142 break; 143 case MAT_CUSPARSE_ALL: 144 cusparsestruct->format = format; 145 break; 146 default: 147 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 148 } 149 PetscFunctionReturn(0); 150 } 151 152 /*@ 153 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 154 operation. Only the MatMult operation can use different GPU storage formats 155 for MPIAIJCUSPARSE matrices. 156 Not Collective 157 158 Input Parameters: 159 + A - Matrix of type SEQAIJCUSPARSE 160 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 161 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 162 163 Output Parameter: 164 165 Level: intermediate 166 167 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 168 @*/ 169 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 170 { 171 PetscErrorCode ierr; 172 173 PetscFunctionBegin; 174 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 175 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 176 PetscFunctionReturn(0); 177 } 178 179 /*@ 180 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 181 182 Collective on mat 183 184 Input Parameters: 185 + A - Matrix of type SEQAIJCUSPARSE 186 - transgen - the boolean flag 187 188 Level: intermediate 189 190 .seealso: MATSEQAIJCUSPARSE 191 @*/ 192 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 193 { 194 PetscErrorCode ierr; 195 PetscBool flg; 196 197 PetscFunctionBegin; 198 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 199 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 200 if (flg) { 201 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 202 203 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 204 cusp->transgen = transgen; 205 if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 206 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 207 } 208 } 209 PetscFunctionReturn(0); 210 } 211 212 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 213 { 214 PetscErrorCode ierr; 215 MatCUSPARSEStorageFormat format; 216 PetscBool flg; 217 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 218 219 PetscFunctionBegin; 
220 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 221 if (A->factortype == MAT_FACTOR_NONE) { 222 PetscBool transgen = cusparsestruct->transgen; 223 224 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 225 if (flg) { 226 ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr); 227 } 228 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 229 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 230 if (flg) { 231 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 232 } 233 } 234 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 235 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 236 if (flg) { 237 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 238 } 239 ierr = PetscOptionsTail();CHKERRQ(ierr); 240 PetscFunctionReturn(0); 241 } 242 243 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 244 { 245 PetscErrorCode ierr; 246 247 PetscFunctionBegin; 248 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 249 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 250 PetscFunctionReturn(0); 251 } 252 253 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 254 { 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 259 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 260 PetscFunctionReturn(0); 261 } 262 263 static 
PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (on the host, in pinned memory) the unit-diagonal lower triangular factor L
   from the CPU ILU factorization stored in A, upload it to the GPU as a CSR matrix,
   and run the cusparse solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix
         including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (on the host, in pinned memory) the non-unit-diagonal upper triangular factor U
   from the CPU ILU factorization stored in A, upload it to the GPU as a CSR matrix,
   and run the cusparse solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
PetscErrorCode ierr; 401 cudaError_t cerr; 402 403 PetscFunctionBegin; 404 if (!n) PetscFunctionReturn(0); 405 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 406 try { 407 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 408 nzUpper = adiag[0]-adiag[n]; 409 410 /* Allocate Space for the upper triangular matrix */ 411 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 412 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 413 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 414 415 /* Fill the upper triangular matrix */ 416 AiUp[0]=(PetscInt) 0; 417 AiUp[n]=nzUpper; 418 offset = nzUpper; 419 for (i=n-1; i>=0; i--) { 420 v = aa + adiag[i+1] + 1; 421 vi = aj + adiag[i+1] + 1; 422 423 /* number of elements NOT on the diagonal */ 424 nz = adiag[i] - adiag[i+1]-1; 425 426 /* decrement the offset */ 427 offset -= (nz+1); 428 429 /* first, set the diagonal elements */ 430 AjUp[offset] = (PetscInt) i; 431 AAUp[offset] = (MatScalar)1./v[nz]; 432 AiUp[i] = AiUp[i+1] - (nz+1); 433 434 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 435 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 436 } 437 438 /* allocate space for the triangular factor information */ 439 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 440 441 /* Create the matrix description */ 442 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 443 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 444 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 445 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 446 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 447 448 /* Create the solve analysis information 
*/ 449 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 450 451 /* set the operation */ 452 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 453 454 /* set the matrix */ 455 upTriFactor->csrMat = new CsrMatrix; 456 upTriFactor->csrMat->num_rows = n; 457 upTriFactor->csrMat->num_cols = n; 458 upTriFactor->csrMat->num_entries = nzUpper; 459 460 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 461 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 462 463 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 464 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 465 466 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 467 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 468 469 /* perform the solve analysis */ 470 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 471 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 472 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 473 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 474 475 /* assign the pointer. Is this really necessary? 
*/ 476 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 477 478 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 479 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 480 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 481 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 482 } catch(char *ex) { 483 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 484 } 485 } 486 PetscFunctionReturn(0); 487 } 488 489 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 490 { 491 PetscErrorCode ierr; 492 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 493 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 494 IS isrow = a->row,iscol = a->icol; 495 PetscBool row_identity,col_identity; 496 const PetscInt *r,*c; 497 PetscInt n = A->rmap->n; 498 499 PetscFunctionBegin; 500 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 501 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 502 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 503 504 cusparseTriFactors->workVector = new THRUSTARRAY(n); 505 cusparseTriFactors->nnz=a->nz; 506 507 A->offloadmask = PETSC_OFFLOAD_BOTH; 508 /* lower triangular indices */ 509 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 510 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 511 if (!row_identity) { 512 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 513 cusparseTriFactors->rpermIndices->assign(r, r+n); 514 } 515 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 516 517 /* upper triangular indices */ 518 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 519 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 520 if (!col_identity) { 521 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 522 cusparseTriFactors->cpermIndices->assign(c, c+n); 523 } 524 525 if (!row_identity && !col_identity) { 526 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 527 } else 
if(!row_identity) { 528 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 529 } else if(!col_identity) { 530 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 531 } 532 533 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 534 PetscFunctionReturn(0); 535 } 536 537 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 538 { 539 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 540 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 541 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 542 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 543 cusparseStatus_t stat; 544 PetscErrorCode ierr; 545 cudaError_t cerr; 546 PetscInt *AiUp, *AjUp; 547 PetscScalar *AAUp; 548 PetscScalar *AALo; 549 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 550 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 551 const PetscInt *ai = b->i,*aj = b->j,*vj; 552 const MatScalar *aa = b->a,*v; 553 554 PetscFunctionBegin; 555 if (!n) PetscFunctionReturn(0); 556 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 557 try { 558 /* Allocate Space for the upper triangular matrix */ 559 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 560 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 561 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 562 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 563 564 /* Fill the upper triangular matrix */ 565 AiUp[0]=(PetscInt) 0; 566 AiUp[n]=nzUpper; 567 offset = 0; 568 for (i=0; i<n; i++) { 569 /* set the pointers */ 570 v = aa + ai[i]; 571 vj = aj + ai[i]; 572 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 573 574 /* first, set the diagonal elements */ 575 AjUp[offset] = (PetscInt) i; 576 
AAUp[offset] = (MatScalar)1.0/v[nz]; 577 AiUp[i] = offset; 578 AALo[offset] = (MatScalar)1.0/v[nz]; 579 580 offset+=1; 581 if (nz>0) { 582 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 583 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 584 for (j=offset; j<offset+nz; j++) { 585 AAUp[j] = -AAUp[j]; 586 AALo[j] = AAUp[j]/v[nz]; 587 } 588 offset+=nz; 589 } 590 } 591 592 /* allocate space for the triangular factor information */ 593 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 594 595 /* Create the matrix description */ 596 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 597 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 598 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 599 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 600 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 601 602 /* Create the solve analysis information */ 603 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 604 605 /* set the operation */ 606 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 607 608 /* set the matrix */ 609 upTriFactor->csrMat = new CsrMatrix; 610 upTriFactor->csrMat->num_rows = A->rmap->n; 611 upTriFactor->csrMat->num_cols = A->cmap->n; 612 upTriFactor->csrMat->num_entries = a->nz; 613 614 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 615 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 616 617 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 618 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 619 620 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 621 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 622 623 /* perform the solve analysis */ 624 stat = 
cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 625 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 626 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 627 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 628 629 /* assign the pointer. Is this really necessary? */ 630 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 631 632 /* allocate space for the triangular factor information */ 633 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 634 635 /* Create the matrix description */ 636 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 637 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 638 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 639 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 640 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 641 642 /* Create the solve analysis information */ 643 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 644 645 /* set the operation */ 646 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 647 648 /* set the matrix */ 649 loTriFactor->csrMat = new CsrMatrix; 650 loTriFactor->csrMat->num_rows = A->rmap->n; 651 loTriFactor->csrMat->num_cols = A->cmap->n; 652 loTriFactor->csrMat->num_entries = a->nz; 653 654 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 655 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 656 657 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 658 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 659 660 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 661 loTriFactor->csrMat->values->assign(AALo, 
AALo+a->nz); 662 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 663 664 /* perform the solve analysis */ 665 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 666 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 667 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 668 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 669 670 /* assign the pointer. Is this really necessary? */ 671 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 672 673 A->offloadmask = PETSC_OFFLOAD_BOTH; 674 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 675 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 676 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 677 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 678 } catch(char *ex) { 679 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 680 } 681 } 682 PetscFunctionReturn(0); 683 } 684 685 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 686 { 687 PetscErrorCode ierr; 688 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 689 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 690 IS ip = a->row; 691 const PetscInt *rip; 692 PetscBool perm_identity; 693 PetscInt n = A->rmap->n; 694 695 PetscFunctionBegin; 696 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 697 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 698 cusparseTriFactors->workVector = new THRUSTARRAY(n); 699 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 700 701 /* lower triangular indices */ 702 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 703 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 704 if (!perm_identity) { 705 IS iip; 706 const PetscInt *irip; 707 708 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 709 ierr = 
ISGetIndices(iip,&irip);CHKERRQ(ierr); 710 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 711 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 712 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 713 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 714 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 715 ierr = ISDestroy(&iip);CHKERRQ(ierr); 716 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 717 } 718 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 719 PetscFunctionReturn(0); 720 } 721 722 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 723 { 724 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 725 IS isrow = b->row,iscol = b->col; 726 PetscBool row_identity,col_identity; 727 PetscErrorCode ierr; 728 729 PetscFunctionBegin; 730 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 731 B->offloadmask = PETSC_OFFLOAD_CPU; 732 /* determine which version of MatSolve needs to be used. */ 733 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 734 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 735 if (row_identity && col_identity) { 736 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 737 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 738 B->ops->matsolve = NULL; 739 B->ops->matsolvetranspose = NULL; 740 } else { 741 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 742 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 743 B->ops->matsolve = NULL; 744 B->ops->matsolvetranspose = NULL; 745 } 746 747 /* get the triangular factors */ 748 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 749 PetscFunctionReturn(0); 750 } 751 752 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 753 { 754 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 755 IS ip = b->row; 756 PetscBool perm_identity; 757 PetscErrorCode ierr; 758 759 PetscFunctionBegin; 760 ierr = 
MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 761 B->offloadmask = PETSC_OFFLOAD_CPU; 762 /* determine which version of MatSolve needs to be used. */ 763 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 764 if (perm_identity) { 765 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 766 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 767 B->ops->matsolve = NULL; 768 B->ops->matsolvetranspose = NULL; 769 } else { 770 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 771 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 772 B->ops->matsolve = NULL; 773 B->ops->matsolvetranspose = NULL; 774 } 775 776 /* get the triangular factors */ 777 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 778 PetscFunctionReturn(0); 779 } 780 781 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 782 { 783 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 784 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 785 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 786 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 787 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 788 cusparseStatus_t stat; 789 cusparseIndexBase_t indexBase; 790 cusparseMatrixType_t matrixType; 791 cusparseFillMode_t fillMode; 792 cusparseDiagType_t diagType; 793 794 PetscFunctionBegin; 795 796 /*********************************************/ 797 /* Now the Transpose of the Lower Tri Factor */ 798 /*********************************************/ 799 800 /* allocate space for the transpose of the lower triangular factor */ 801 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 802 803 /* set the 
matrix descriptors of the lower triangular factor */ 804 matrixType = cusparseGetMatType(loTriFactor->descr); 805 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 806 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 807 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 808 diagType = cusparseGetMatDiagType(loTriFactor->descr); 809 810 /* Create the matrix description */ 811 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 812 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 813 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 814 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 815 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 816 817 /* Create the solve analysis information */ 818 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 819 820 /* set the operation */ 821 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 822 823 /* allocate GPU space for the CSC of the lower triangular factor*/ 824 loTriFactorT->csrMat = new CsrMatrix; 825 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 826 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 827 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 828 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 829 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 830 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 831 832 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 833 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 834 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 835 loTriFactor->csrMat->values->data().get(), 836 loTriFactor->csrMat->row_offsets->data().get(), 837 loTriFactor->csrMat->column_indices->data().get(), 838 loTriFactorT->csrMat->values->data().get(), 839 loTriFactorT->csrMat->column_indices->data().get(), 840 loTriFactorT->csrMat->row_offsets->data().get(), 841 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 842 843 /* perform the solve analysis on the transposed matrix */ 844 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 845 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 846 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 847 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 848 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 849 850 /* assign the pointer. Is this really necessary? */ 851 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 852 853 /*********************************************/ 854 /* Now the Transpose of the Upper Tri Factor */ 855 /*********************************************/ 856 857 /* allocate space for the transpose of the upper triangular factor */ 858 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 859 860 /* set the matrix descriptors of the upper triangular factor */ 861 matrixType = cusparseGetMatType(upTriFactor->descr); 862 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 863 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
864 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 865 diagType = cusparseGetMatDiagType(upTriFactor->descr); 866 867 /* Create the matrix description */ 868 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 869 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 870 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 871 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 872 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 873 874 /* Create the solve analysis information */ 875 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 876 877 /* set the operation */ 878 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 879 880 /* allocate GPU space for the CSC of the upper triangular factor*/ 881 upTriFactorT->csrMat = new CsrMatrix; 882 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 883 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 884 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 885 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 886 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 887 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 888 889 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 890 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 891 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 892 upTriFactor->csrMat->values->data().get(), 893 upTriFactor->csrMat->row_offsets->data().get(), 894 upTriFactor->csrMat->column_indices->data().get(), 895 upTriFactorT->csrMat->values->data().get(), 896 upTriFactorT->csrMat->column_indices->data().get(), 897 upTriFactorT->csrMat->row_offsets->data().get(), 898 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 899 900 /* perform the solve analysis on the transposed matrix */ 901 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 902 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 903 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 904 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 905 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 906 907 /* assign the pointer. Is this really necessary? 
*/ 908 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 909 PetscFunctionReturn(0); 910 } 911 912 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 913 { 914 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 915 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 916 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 917 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 918 cusparseStatus_t stat; 919 cusparseIndexBase_t indexBase; 920 cudaError_t err; 921 PetscErrorCode ierr; 922 923 PetscFunctionBegin; 924 if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 925 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 926 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 927 /* create cusparse matrix */ 928 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 929 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 930 indexBase = cusparseGetMatIndexBase(matstruct->descr); 931 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 932 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 933 934 /* set alpha and beta */ 935 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 936 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 937 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 938 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 939 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 940 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 941 stat = 
cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 942 943 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 944 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 945 CsrMatrix *matrixT= new CsrMatrix; 946 matrixT->num_rows = A->cmap->n; 947 matrixT->num_cols = A->rmap->n; 948 matrixT->num_entries = a->nz; 949 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 950 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 951 matrixT->values = new THRUSTARRAY(a->nz); 952 953 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 954 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 955 /* compute the transpose, i.e. the CSC */ 956 indexBase = cusparseGetMatIndexBase(matstruct->descr); 957 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 958 A->cmap->n, matrix->num_entries, 959 matrix->values->data().get(), 960 cusparsestruct->rowoffsets_gpu->data().get(), 961 matrix->column_indices->data().get(), 962 matrixT->values->data().get(), 963 matrixT->column_indices->data().get(), 964 matrixT->row_offsets->data().get(), 965 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 966 matstructT->mat = matrixT; 967 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 968 CsrMatrix *temp = new CsrMatrix; 969 CsrMatrix *tempT = new CsrMatrix; 970 971 /* First convert HYB to CSR */ 972 temp->num_rows = A->rmap->n; 973 temp->num_cols = A->cmap->n; 974 temp->num_entries = a->nz; 975 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 976 temp->column_indices = new THRUSTINTARRAY32(a->nz); 977 temp->values = new THRUSTARRAY(a->nz); 978 979 stat = cusparse_hyb2csr(cusparsestruct->handle, 980 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 981 temp->values->data().get(), 982 temp->row_offsets->data().get(), 983 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 984 985 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 986 tempT->num_rows = A->rmap->n; 987 tempT->num_cols = A->cmap->n; 988 tempT->num_entries = a->nz; 989 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 990 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 991 tempT->values = new THRUSTARRAY(a->nz); 992 993 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 994 temp->num_cols, temp->num_entries, 995 temp->values->data().get(), 996 temp->row_offsets->data().get(), 997 temp->column_indices->data().get(), 998 tempT->values->data().get(), 999 tempT->column_indices->data().get(), 1000 tempT->row_offsets->data().get(), 1001 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1002 1003 /* Last, convert CSC to HYB */ 1004 cusparseHybMat_t hybMat; 1005 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1006 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1007 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1008 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1009 matstructT->descr, tempT->values->data().get(), 1010 tempT->row_offsets->data().get(), 1011 tempT->column_indices->data().get(), 1012 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1013 1014 /* assign the pointer */ 1015 matstructT->mat = hybMat; 1016 /* delete temporaries */ 1017 if (tempT) { 1018 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1019 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1020 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1021 delete (CsrMatrix*) tempT; 1022 } 1023 if (temp) { 1024 if (temp->values) delete (THRUSTARRAY*) temp->values; 1025 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1026 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1027 delete (CsrMatrix*) temp; 1028 } 1029 } 1030 err = WaitForCUDA();CHKERRCUDA(err); 1031 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1032 ierr = 
PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1033 /* the compressed row indices is not used for matTranspose */ 1034 matstructT->cprowIndices = NULL; 1035 /* assign the pointer */ 1036 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1037 PetscFunctionReturn(0); 1038 } 1039 1040 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1041 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1042 { 1043 PetscInt n = xx->map->n; 1044 const PetscScalar *barray; 1045 PetscScalar *xarray; 1046 thrust::device_ptr<const PetscScalar> bGPU; 1047 thrust::device_ptr<PetscScalar> xGPU; 1048 cusparseStatus_t stat; 1049 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1050 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1051 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1052 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1053 PetscErrorCode ierr; 1054 cudaError_t cerr; 1055 1056 PetscFunctionBegin; 1057 /* Analyze the matrix and create the transpose ... 
on the fly */ 1058 if (!loTriFactorT && !upTriFactorT) { 1059 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1060 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1061 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1062 } 1063 1064 /* Get the GPU pointers */ 1065 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1066 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1067 xGPU = thrust::device_pointer_cast(xarray); 1068 bGPU = thrust::device_pointer_cast(barray); 1069 1070 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1071 /* First, reorder with the row permutation */ 1072 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1073 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1074 xGPU); 1075 1076 /* First, solve U */ 1077 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1078 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1079 upTriFactorT->csrMat->values->data().get(), 1080 upTriFactorT->csrMat->row_offsets->data().get(), 1081 upTriFactorT->csrMat->column_indices->data().get(), 1082 upTriFactorT->solveInfo, 1083 xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1084 1085 /* Then, solve L */ 1086 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1087 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1088 loTriFactorT->csrMat->values->data().get(), 1089 loTriFactorT->csrMat->row_offsets->data().get(), 1090 loTriFactorT->csrMat->column_indices->data().get(), 1091 loTriFactorT->solveInfo, 1092 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1093 1094 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1095 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1096 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1097 tempGPU->begin()); 1098 1099 /* Copy the temporary to the full solution. */ 1100 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1101 1102 /* restore */ 1103 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1104 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1105 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1106 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1107 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1108 PetscFunctionReturn(0); 1109 } 1110 1111 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1112 { 1113 const PetscScalar *barray; 1114 PetscScalar *xarray; 1115 cusparseStatus_t stat; 1116 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1117 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1118 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1119 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1120 PetscErrorCode ierr; 1121 cudaError_t cerr; 1122 1123 PetscFunctionBegin; 1124 /* Analyze the matrix and create the transpose ... 
on the fly */ 1125 if (!loTriFactorT && !upTriFactorT) { 1126 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1127 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1128 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1129 } 1130 1131 /* Get the GPU pointers */ 1132 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1133 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1134 1135 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1136 /* First, solve U */ 1137 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1138 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1139 upTriFactorT->csrMat->values->data().get(), 1140 upTriFactorT->csrMat->row_offsets->data().get(), 1141 upTriFactorT->csrMat->column_indices->data().get(), 1142 upTriFactorT->solveInfo, 1143 barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1144 1145 /* Then, solve L */ 1146 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1147 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1148 loTriFactorT->csrMat->values->data().get(), 1149 loTriFactorT->csrMat->row_offsets->data().get(), 1150 loTriFactorT->csrMat->column_indices->data().get(), 1151 loTriFactorT->solveInfo, 1152 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1153 1154 /* restore */ 1155 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1156 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1157 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1158 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1159 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1160 PetscFunctionReturn(0); 1161 } 1162 1163 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1164 { 1165 const PetscScalar *barray; 1166 PetscScalar *xarray; 1167 thrust::device_ptr<const PetscScalar> bGPU; 1168 
thrust::device_ptr<PetscScalar> xGPU; 1169 cusparseStatus_t stat; 1170 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1171 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1172 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1173 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1174 PetscErrorCode ierr; 1175 cudaError_t cerr; 1176 1177 PetscFunctionBegin; 1178 1179 /* Get the GPU pointers */ 1180 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1181 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1182 xGPU = thrust::device_pointer_cast(xarray); 1183 bGPU = thrust::device_pointer_cast(barray); 1184 1185 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1186 /* First, reorder with the row permutation */ 1187 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1188 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1189 tempGPU->begin()); 1190 1191 /* Next, solve L */ 1192 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1193 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1194 loTriFactor->csrMat->values->data().get(), 1195 loTriFactor->csrMat->row_offsets->data().get(), 1196 loTriFactor->csrMat->column_indices->data().get(), 1197 loTriFactor->solveInfo, 1198 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1199 1200 /* Then, solve U */ 1201 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1202 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1203 upTriFactor->csrMat->values->data().get(), 1204 upTriFactor->csrMat->row_offsets->data().get(), 1205 upTriFactor->csrMat->column_indices->data().get(), 1206 upTriFactor->solveInfo, 1207 xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 
1208 1209 /* Last, reorder with the column permutation */ 1210 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1211 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1212 xGPU); 1213 1214 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1215 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1216 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1217 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1218 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1219 PetscFunctionReturn(0); 1220 } 1221 1222 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1223 { 1224 const PetscScalar *barray; 1225 PetscScalar *xarray; 1226 cusparseStatus_t stat; 1227 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1228 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1229 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1230 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1231 PetscErrorCode ierr; 1232 cudaError_t cerr; 1233 1234 PetscFunctionBegin; 1235 /* Get the GPU pointers */ 1236 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1237 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1238 1239 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1240 /* First, solve L */ 1241 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1242 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1243 loTriFactor->csrMat->values->data().get(), 1244 loTriFactor->csrMat->row_offsets->data().get(), 1245 loTriFactor->csrMat->column_indices->data().get(), 1246 loTriFactor->solveInfo, 1247 barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1248 1249 /* Next, solve U */ 1250 stat = 
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1251 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1252 upTriFactor->csrMat->values->data().get(), 1253 upTriFactor->csrMat->row_offsets->data().get(), 1254 upTriFactor->csrMat->column_indices->data().get(), 1255 upTriFactor->solveInfo, 1256 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1257 1258 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1259 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1260 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1261 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1262 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1263 PetscFunctionReturn(0); 1264 } 1265 1266 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1267 { 1268 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1269 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1270 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1271 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1272 PetscErrorCode ierr; 1273 cusparseStatus_t stat; 1274 cudaError_t err; 1275 1276 PetscFunctionBegin; 1277 if (A->boundtocpu) PetscFunctionReturn(0); 1278 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1279 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1280 /* Copy values only */ 1281 CsrMatrix *mat,*matT; 1282 mat = (CsrMatrix*)cusparsestruct->mat->mat; 1283 1284 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1285 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1286 mat->values->assign(a->a, a->a+a->nz); 1287 err = WaitForCUDA();CHKERRCUDA(err); 1288 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1289 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1290 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1291 1292 /* Update matT when it was built before */ 1293 if 
(cusparsestruct->matTranspose) { 1294 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1295 matT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1296 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1297 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1298 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1299 A->cmap->n, mat->num_entries, 1300 mat->values->data().get(), 1301 cusparsestruct->rowoffsets_gpu->data().get(), 1302 mat->column_indices->data().get(), 1303 matT->values->data().get(), 1304 matT->column_indices->data().get(), 1305 matT->row_offsets->data().get(), 1306 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat); 1307 err = WaitForCUDA();CHKERRCUDA(err); 1308 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1309 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1310 } 1311 } else { 1312 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1313 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1314 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1315 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1316 delete cusparsestruct->workVector; 1317 delete cusparsestruct->rowoffsets_gpu; 1318 try { 1319 if (a->compressedrow.use) { 1320 m = a->compressedrow.nrows; 1321 ii = a->compressedrow.i; 1322 ridx = a->compressedrow.rindex; 1323 } else { 1324 m = A->rmap->n; 1325 ii = a->i; 1326 ridx = NULL; 1327 } 1328 cusparsestruct->nrows = m; 1329 1330 /* create cusparse matrix */ 1331 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1332 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1333 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1334 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1335 1336 err = cudaMalloc((void 
**)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1337 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1338 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1339 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1340 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1341 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1342 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1343 1344 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1345 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1346 /* set the matrix */ 1347 CsrMatrix *matrix= new CsrMatrix; 1348 matrix->num_rows = m; 1349 matrix->num_cols = A->cmap->n; 1350 matrix->num_entries = a->nz; 1351 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1352 matrix->row_offsets->assign(ii, ii + m+1); 1353 1354 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1355 matrix->column_indices->assign(a->j, a->j+a->nz); 1356 1357 matrix->values = new THRUSTARRAY(a->nz); 1358 matrix->values->assign(a->a, a->a+a->nz); 1359 1360 /* assign the pointer */ 1361 matstruct->mat = matrix; 1362 1363 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1364 CsrMatrix *matrix= new CsrMatrix; 1365 matrix->num_rows = m; 1366 matrix->num_cols = A->cmap->n; 1367 matrix->num_entries = a->nz; 1368 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1369 matrix->row_offsets->assign(ii, ii + m+1); 1370 1371 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1372 matrix->column_indices->assign(a->j, a->j+a->nz); 1373 1374 matrix->values = new THRUSTARRAY(a->nz); 1375 matrix->values->assign(a->a, a->a+a->nz); 1376 1377 
        /* NOTE(review): tail of MatSeqAIJCUSPARSECopyToGPU() -- the function header and
           the CSR branch begin earlier in the file.  This is the ELL/HYB path: convert
           the temporary CSR staging copy into a cuSPARSE hybrid matrix. */
        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
        /* ELL requests the fixed-width (max) partition; HYB lets cuSPARSE choose */
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSPARSE(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the CSR staging arrays are no longer needed once the HYB matrix is built */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
      }

      /* assign the compressed row indices */
      if (a->compressedrow.use) {
        /* compressed-row storage (zero rows dropped): keep a GPU work vector plus the
           map from compressed row slots to full row indices */
        cusparsestruct->workVector = new THRUSTARRAY(m);
        matstruct->cprowIndices    = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        tmp = m;
      } else {
        cusparsestruct->workVector = NULL;
        matstruct->cprowIndices    = NULL;
        tmp = 0;
      }
      ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* remember which nonzero state was copied so unchanged matrices skip the transfer */
    cusparsestruct->nonzerostate = A->nonzerostate;
  }
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Functor for thrust::for_each over a (work, vec) zip tuple: vec[i] += work[i].
   Used below to scatter-add SpMV results from the compressed work vector into
   the full output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor for thrust::for_each over a (work, vec) zip tuple: work[i] = vec[i].
   Used below to gather input entries into the compressed work vector. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data for sparse(AIJCUSPARSE) * dense products; stored in C->product->data */
typedef struct {
  PetscBool   cisdense; /* the user's C was MATSEQDENSE (host); convert back after the numeric phase */
  PetscScalar *Bt;      /* device buffer holding an explicit transpose of B (ABt and RARt only) */
  Mat         X;        /* dense intermediate A*B^T (RARt) or A*B (PtAP) */
} MatMatCusparse;

/* Destructor for the product scratch data above; installed as C->product->destroy */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase for products of a SeqAIJCUSPARSE matrix A with a dense matrix B
   (AB, AtB, ABt, PtAP, RARt), computed on the GPU with cusparse_csr_spmm */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to
     raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* choose the sparse operand (A or its stored transpose), the cuSPARSE op,
     and the GEMM dimensions: result is m x n with inner dimension k */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* an explicit transpose of A is enabled; build/fetch it and use op = non-transpose */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write the sparse product into the intermediate X; other types write into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B: build an explicit B^T with cuBLAS geam */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n; /* Bt has leading dimension B->cmap->n */
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForCUDA();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    /* finish RARt: dense-dense multiply of B with the intermediate X into C */
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    /* finish PtAP: dense-dense multiply (first operand flagged PETSC_TRUE) of B with X into C */
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* the user asked for a host dense C: move the result back to the CPU type */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    /* B was converted to MATSEQDENSECUDA on entry; convert it back for the caller */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase: set C's sizes and type (MATSEQDENSECUDA) and allocate the
   per-product scratch data used by the numeric phase above */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  /* a CPU-bound A falls back to the plain SeqAIJ*SeqDense implementation */
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    /* fall through */
  default:
    break;
  }
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1697 { 1698 PetscErrorCode ierr; 1699 1700 PetscFunctionBegin; 1701 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1702 PetscFunctionReturn(0); 1703 } 1704 1705 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 1706 { 1707 PetscErrorCode ierr; 1708 1709 PetscFunctionBegin; 1710 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1711 PetscFunctionReturn(0); 1712 } 1713 1714 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1715 { 1716 PetscErrorCode ierr; 1717 1718 PetscFunctionBegin; 1719 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 1720 PetscFunctionReturn(0); 1721 } 1722 1723 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1724 { 1725 PetscErrorCode ierr; 1726 1727 PetscFunctionBegin; 1728 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 1729 PetscFunctionReturn(0); 1730 } 1731 1732 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1733 { 1734 PetscErrorCode ierr; 1735 1736 PetscFunctionBegin; 1737 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1738 PetscFunctionReturn(0); 1739 } 1740 1741 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 1742 { 1743 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1744 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1745 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1746 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 1747 PetscErrorCode ierr; 1748 cudaError_t cerr; 1749 cusparseStatus_t stat; 1750 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1751 PetscBool compressed; 1752 1753 PetscFunctionBegin; 1754 if (herm && !trans) 
    SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* matrix with no nonzero rows: the result is just zero (plus yy, which for yy==zz is already in place) */
  if (!a->nonzerorowcnt) {
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose available (or Hermitian requested): use a transposed op on A itself */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        /* build the explicit transpose lazily on first use */
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather the needed entries of xx into the work vector: work[i] = xx[cprowIndices[i]] */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
    }

    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      /* ELL/HYB storage path */
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: beta already folded yy in when yy==zz; otherwise add it here */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* After the usual SeqAIJ assembly, mirror the (final) assembled matrix to the GPU */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* flush assemblies and CPU-bound matrices are not copied to the GPU */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate through the SeqAIJ implementation; the cast drops const because
     the implementation takes a non-const pointer */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored and unfactored matrices hang different GPU structures off A->spptr */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* drop the composed methods installed at conversion time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr =
         PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate via the SeqAIJ machinery, then convert the copy in place to AIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Toggle the mult-family ops between the CPU (SeqAIJ) and GPU (CUSPARSE) implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE; also used as the in-place bootstrap
   from MatCreate_SeqAIJCUSPARSE() and MatDuplicate_SeqAIJCUSPARSE() */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* GPU matrices pair with CUDA vectors by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse !=
      MAT_REUSE_MATRIX && !B->spptr) {
    /* allocate the GPU-side structure; its layout depends on whether B is a factor */
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU mult ops and the product hooks */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure CUDA is initialized before any device work */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE solver for LU, Cholesky, ILU and ICC factorizations of SEQAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free the Mat_SeqAIJCUSPARSE structure: both mult structs, the work vector,
   the cached row offsets, and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr =
           MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* NOTE(review): assignment inside the condition is intentional but warning-prone;
       consider the extra-parentheses form if ((handle = ...)) */
    if (handle = (*cusparsestruct)->handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix and its thrust arrays; *mat is zeroed on return */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Destroy a triangular-factor struct: matrix descriptor, solve-analysis info and CSR data */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Destroy a mult struct; the storage format decides whether mat holds a HYB or a CSR matrix */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* NOTE(review): return value ignored here, unlike the ierr/CHKERRQ pattern used elsewhere */
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are device-resident scalars allocated with cudaMalloc */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Release the contents of a TriFactors struct but keep the struct (and its handle) alive */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector   = 0;
  }
  PetscFunctionReturn(0);
}

/* Fully destroy a TriFactors struct, including its cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* NOTE(review): assignment inside the condition is intentional; consider extra parentheses */
    if (handle = (*trifactors)->handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#else

/* The following stubs are only provided to satisfy the linker */

PetscErrorCode
MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) { 2220 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!"); 2221 } 2222 2223 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2224 { 2225 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!"); 2226 } 2227 2228 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 2229 { 2230 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!"); 2231 } 2232 2233 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 2234 { 2235 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!"); 2236 } 2237 #endif 2238