1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format. 4 */ 5 6 #include "petscconf.h" 7 PETSC_CUDA_EXTERN_C_BEGIN 8 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 9 //#include "petscbt.h" 10 #include "../src/vec/vec/impls/dvecimpl.h" 11 #include "petsc-private/vecimpl.h" 12 PETSC_CUDA_EXTERN_C_END 13 #undef VecType 14 #include "cusparsematimpl.h" 15 const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 /* this is such a hack ... but I don't know of another way to pass this variable 18 from one GPU_Matrix_Ifc class to another. This is necessary for the parallel 19 SpMV. Essentially, I need to use the same stream variable in two different 20 data structures. I do this by creating a single instance of that stream 21 and reuse it. */ 22 cudaStream_t theBodyStream=0; 23 24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *); 27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 30 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat); 31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 35 36 #undef __FUNCT__ 37 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 38 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 39 { 40 PetscFunctionBegin; 41 *type = MATSOLVERCUSPARSE; 42 PetscFunctionReturn(0); 43 } 44 45 EXTERN_C_BEGIN 46 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 47 EXTERN_C_END 48 /* 49 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 50 on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 51 52 Level: beginner 53 */ 54 55 EXTERN_C_BEGIN 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 58 PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 59 { 60 PetscErrorCode ierr; 61 62 PetscFunctionBegin; 63 ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 64 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 65 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 66 ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 67 ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 68 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 69 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 70 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 71 (*B)->factortype = ftype; 72 PetscFunctionReturn(0); 73 } 74 EXTERN_C_END 75 76 EXTERN_C_BEGIN 77 #undef __FUNCT__ 78 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 79 PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 80 { 81 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 82 83 PetscFunctionBegin; 84 switch (op) { 85 case MAT_CUSPARSE_MULT: 86 cusparseMat->format = format; 87 break; 88 case MAT_CUSPARSE_SOLVE: 89 cusparseMatSolveStorageFormat = format; 90 break; 91 case MAT_CUSPARSE_ALL: 92 cusparseMat->format = format; 93 cusparseMatSolveStorageFormat = format; 94 break; 95 default: 96 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 97 } 98 PetscFunctionReturn(0); 99 } 100 EXTERN_C_END 101 102 103 /*@ 104 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 105 operation. Only the MatMult operation can use different GPU storage formats 106 for AIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu 107 to build/install PETSc to use this package. 108 109 Not Collective 110 111 Input Parameters: 112 + A : Matrix of type SEQAIJCUSPARSE 113 . op : MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 114 - format : MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB) 115 116 Output Parameter: 117 118 Level: intermediate 119 120 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEARSEFormatOperation 121 @*/ 122 #undef __FUNCT__ 123 #define __FUNCT__ "MatCUSPARSESetFormat" 124 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 125 { 126 PetscErrorCode ierr; 127 128 PetscFunctionBegin; 129 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 130 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 131 PetscFunctionReturn(0); 132 } 133 134 #undef __FUNCT__ 135 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 136 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 137 { 138 PetscErrorCode ierr; 139 MatCUSPARSEStorageFormat format; 140 PetscBool flg; 141 142 PetscFunctionBegin; 143 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 144 ierr = PetscObjectOptionsBegin((PetscObject)A); 145 if (A->factortype==MAT_FACTOR_NONE) { 146 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 147 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 148 if (flg) { 149 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 150 } 151 } 152 else { 153 ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve", 154 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 155 if (flg) { 156 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr); 157 } 158 } 159 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 160 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 161 if (flg) { 162 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 163 } 164 ierr = PetscOptionsEnd();CHKERRQ(ierr); 165 PetscFunctionReturn(0); 166 167 } 168 169 #undef __FUNCT__ 170 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 171 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 172 { 173 PetscErrorCode ierr; 174 175 PetscFunctionBegin; 176 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 177 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 178 PetscFunctionReturn(0); 179 } 180 181 #undef __FUNCT__ 182 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 183 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 184 { 185 PetscErrorCode ierr; 186 187 PetscFunctionBegin; 188 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 189 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 190 PetscFunctionReturn(0); 191 } 192 193 #undef __FUNCT__ 194 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix" 195 PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A) 196 { 197 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 198 PetscInt n = A->rmap->n; 199 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 200 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 201 cusparseStatus_t stat; 202 const PetscInt *ai = a->i,*aj = a->j,*vi; 203 const MatScalar *aa = a->a,*v; 204 PetscErrorCode ierr; 205 PetscInt *AiLo, *AjLo; 206 PetscScalar *AALo; 207 PetscInt i,nz, nzLower, offset, rowOffset; 208 209 PetscFunctionBegin; 210 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 211 try { 212 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 213 nzLower=n+ai[n]-ai[1]; 214 215 /* Allocate Space for the lower triangular matrix */ 216 ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 217 ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 218 ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 219 220 /* Fill the lower triangular matrix */ 221 AiLo[0]=(PetscInt) 0; 222 AiLo[n]=nzLower; 223 AjLo[0]=(PetscInt) 0; 224 AALo[0]=(MatScalar) 1.0; 225 v = aa; 226 vi = aj; 227 offset=1; 228 rowOffset=1; 229 for (i=1; i<n; i++) { 230 nz = ai[i+1] - ai[i]; 231 /* additional 1 for the term on the diagonal */ 232 AiLo[i]=rowOffset; 233 rowOffset+=nz+1; 234 235 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 236 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 237 238 offset+=nz; 239 AjLo[offset]=(PetscInt) i; 240 AALo[offset]=(MatScalar) 1.0; 241 offset+=1; 242 243 v += nz; 244 vi += nz; 245 } 246 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 247 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 248 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 249 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 250 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 251 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 252 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 253 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 254 } catch(char* ex) { 255 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 256 } 257 } 258 PetscFunctionReturn(0); 259 } 260 261 #undef __FUNCT__ 262 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix" 263 PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A) 264 { 265 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 266 PetscInt n = A->rmap->n; 267 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 268 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 269 cusparseStatus_t stat; 270 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 271 const MatScalar *aa = a->a,*v; 272 PetscInt *AiUp, *AjUp; 273 PetscScalar *AAUp; 274 PetscInt i,nz, nzUpper, offset; 275 PetscErrorCode ierr; 276 277 PetscFunctionBegin; 278 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 279 try { 280 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 281 nzUpper = adiag[0]-adiag[n]; 282 283 /* Allocate Space for the upper triangular matrix */ 284 ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 285 ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 286 ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 287 288 /* Fill the upper triangular matrix */ 289 AiUp[0]=(PetscInt) 0; 290 AiUp[n]=nzUpper; 291 offset = nzUpper; 292 for (i=n-1; i>=0; i--) { 293 v = aa + adiag[i+1] + 1; 294 vi = aj + adiag[i+1] + 1; 295 296 /* number of elements NOT on the diagonal */ 297 nz = adiag[i] - adiag[i+1]-1; 298 299 /* decrement the offset */ 300 offset -= (nz+1); 301 302 /* first, set the diagonal elements */ 303 AjUp[offset] = (PetscInt) i; 304 AAUp[offset] = 1./v[nz]; 305 AiUp[i] = AiUp[i+1] - (nz+1); 306 307 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 308 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 309 } 310 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 311 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 312 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 313 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 314 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 315 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 316 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 317 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 318 } catch(char* ex) { 319 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 320 } 321 } 322 PetscFunctionReturn(0); 323 } 324 325 #undef __FUNCT__ 326 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU" 327 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A) 328 { 329 PetscErrorCode ierr; 330 Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 331 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 332 IS isrow = a->row,iscol = a->icol; 333 PetscBool row_identity,col_identity; 334 const PetscInt *r,*c; 335 PetscInt n = A->rmap->n; 336 337 PetscFunctionBegin; 338 ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 339 ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 340 cusparseTriFactors->tempvec = new CUSPARRAY; 341 cusparseTriFactors->tempvec->resize(n); 342 343 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 344 /*lower triangular indices */ 345 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 346 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 347 if (!row_identity) 348 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 349 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 350 351 /*upper triangular indices */ 352 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 353 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 354 if (!col_identity) 355 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 356 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 357 PetscFunctionReturn(0); 358 } 359 360 #undef __FUNCT__ 361 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 362 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 363 { 364 PetscErrorCode ierr; 365 Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 366 IS isrow = b->row,iscol = b->col; 367 PetscBool row_identity,col_identity; 368 369 PetscFunctionBegin; 370 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 371 /* determine which version of MatSolve needs to be used. */ 372 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 373 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 374 if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 375 else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 376 377 /* get the triangular factors */ 378 ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 379 PetscFunctionReturn(0); 380 } 381 382 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 386 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 389 PetscErrorCode ierr; 390 CUSPARRAY *xGPU=PETSC_NULL, *bGPU=PETSC_NULL; 391 cusparseStatus_t stat; 392 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 393 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 394 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 395 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 396 397 PetscFunctionBegin; 398 /* Get the GPU pointers */ 399 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 400 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 401 402 /* solve with reordering */ 403 ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 404 stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 405 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 406 ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 407 408 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 409 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 410 ierr = WaitForGPU();CHKERRCUSP(ierr); 411 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 412 PetscFunctionReturn(0); 413 } 414 415 416 417 418 #undef __FUNCT__ 419 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 420 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 421 { 422 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 423 PetscErrorCode ierr; 424 CUSPARRAY *xGPU=PETSC_NULL, *bGPU=PETSC_NULL; 425 cusparseStatus_t stat; 426 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 427 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 428 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 429 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 430 431 PetscFunctionBegin; 432 /* Get the GPU pointers */ 433 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 434 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 435 436 /* solve */ 437 stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 438 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 439 440 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 441 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 442 ierr = WaitForGPU();CHKERRCUSP(ierr); 443 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 444 PetscFunctionReturn(0); 445 } 446 447 #undef __FUNCT__ 448 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 449 PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 450 { 451 452 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 453 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 454 PetscInt m = A->rmap->n,*ii,*ridx; 455 PetscErrorCode ierr; 456 457 458 PetscFunctionBegin; 459 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 460 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 461 /* 462 It may be possible to reuse nonzero structure with new matrix values but 463 for simplicity and insured correctness we delete and build a new matrix on 464 the GPU. Likely a very small performance hit. 465 */ 466 if (cusparseMat->mat) { 467 try { 468 delete cusparseMat->mat; 469 if (cusparseMat->tempvec) 470 delete cusparseMat->tempvec; 471 472 } catch(char* ex) { 473 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 474 } 475 } 476 try { 477 cusparseMat->nonzerorow=0; 478 for (int j = 0; j<m; j++) 479 cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 480 481 if (a->compressedrow.use) { 482 m = a->compressedrow.nrows; 483 ii = a->compressedrow.i; 484 ridx = a->compressedrow.rindex; 485 } else { 486 /* Forcing compressed row on the GPU ... only relevant for CSR storage */ 487 int k=0; 488 ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 489 ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 490 ii[0]=0; 491 for (int j = 0; j<m; j++) { 492 if ((a->i[j+1]-a->i[j])>0) { 493 ii[k] = a->i[j]; 494 ridx[k]= j; 495 k++; 496 } 497 } 498 ii[cusparseMat->nonzerorow] = a->nz; 499 m = cusparseMat->nonzerorow; 500 } 501 502 /* Build our matrix ... first determine the GPU storage type */ 503 cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]); 504 505 /* Create the streams and events (if desired). */ 506 PetscMPIInt size; 507 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 508 ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 509 510 /* FILL MODE UPPER is irrelevant */ 511 cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 512 513 /* lastly, build the matrix */ 514 ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 515 cusparseMat->mat->setCPRowIndices(ridx, m); 516 if (!a->compressedrow.use) { 517 ierr = PetscFree(ii);CHKERRQ(ierr); 518 ierr = PetscFree(ridx);CHKERRQ(ierr); 519 } 520 cusparseMat->tempvec = new CUSPARRAY; 521 cusparseMat->tempvec->resize(m); 522 } catch(char* ex) { 523 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 524 } 525 ierr = WaitForGPU();CHKERRCUSP(ierr); 526 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 527 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 528 } 529 PetscFunctionReturn(0); 530 } 531 532 #undef __FUNCT__ 533 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 534 PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 535 { 536 PetscErrorCode ierr; 537 538 PetscFunctionBegin; 539 if (right) { 540 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 541 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 542 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 543 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 544 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 545 } 546 if (left) { 547 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 548 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 549 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 550 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 551 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 552 } 553 PetscFunctionReturn(0); 554 } 555 556 #undef __FUNCT__ 557 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 558 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 559 { 560 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 561 PetscErrorCode ierr; 562 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 563 CUSPARRAY *xarray=PETSC_NULL,*yarray=PETSC_NULL; 564 565 PetscFunctionBegin; 566 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 567 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 568 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 569 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 570 try { 571 ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 572 } catch (char* ex) { 573 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 574 } 575 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 576 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 577 if (!cusparseMat->mat->hasNonZeroStream()) { 578 ierr = WaitForGPU();CHKERRCUSP(ierr); 579 } 580 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 581 PetscFunctionReturn(0); 582 } 583 584 585 #undef __FUNCT__ 586 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 587 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 588 { 589 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 590 PetscErrorCode ierr; 591 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 592 CUSPARRAY *xarray=PETSC_NULL,*yarray=PETSC_NULL; 593 594 PetscFunctionBegin; 595 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 596 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 597 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 598 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 599 try { 600 #if !defined(PETSC_USE_COMPLEX) 601 ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 602 #else 603 ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 604 #endif 605 } catch (char* ex) { 606 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 607 } 608 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 609 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 610 if (!cusparseMat->mat->hasNonZeroStream()) { 611 ierr = WaitForGPU();CHKERRCUSP(ierr); 612 } 613 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 614 PetscFunctionReturn(0); 615 } 616 617 #undef __FUNCT__ 618 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 619 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 620 { 621 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 622 PetscErrorCode ierr; 623 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 624 CUSPARRAY *xarray=PETSC_NULL,*yarray=PETSC_NULL,*zarray=PETSC_NULL; 625 626 PetscFunctionBegin; 627 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 628 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 629 try { 630 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 631 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 632 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 633 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 634 635 /* multiply add */ 636 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 637 638 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 639 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 640 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 641 642 } catch(char* ex) { 643 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 644 } 645 ierr = WaitForGPU();CHKERRCUSP(ierr); 646 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 647 PetscFunctionReturn(0); 648 } 649 650 #undef __FUNCT__ 651 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 652 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 653 { 654 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 655 PetscErrorCode ierr; 656 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 657 CUSPARRAY *xarray,*yarray,*zarray; 658 659 PetscFunctionBegin; 660 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 661 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 662 try { 663 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 664 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 665 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 666 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 667 668 /* multiply add with matrix transpose */ 669 #if !defined(PETSC_USE_COMPLEX) 670 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 671 #else 672 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 673 #endif 674 675 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 676 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 677 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 678 679 } catch(char* ex) { 680 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 681 } 682 ierr = WaitForGPU();CHKERRCUSP(ierr); 683 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 684 PetscFunctionReturn(0); 685 } 686 687 #undef __FUNCT__ 688 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 689 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 690 { 691 PetscErrorCode ierr; 692 693 PetscFunctionBegin; 694 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 695 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 696 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 697 A->ops->mult = MatMult_SeqAIJCUSPARSE; 698 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 699 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 700 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 701 PetscFunctionReturn(0); 702 } 703 704 /* --------------------------------------------------------------------------------*/ 705 #undef __FUNCT__ 706 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 707 /*@ 708 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 709 (the default parallel PETSc format). This matrix will ultimately pushed down 710 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 711 assembly performance the user should preallocate the matrix storage by setting 712 the parameter nz (or the array nnz). By setting these parameters accurately, 713 performance during matrix assembly can be increased by more than a factor of 50. 714 715 Collective on MPI_Comm 716 717 Input Parameters: 718 + comm - MPI communicator, set to PETSC_COMM_SELF 719 . m - number of rows 720 . n - number of columns 721 . nz - number of nonzeros per row (same for all rows) 722 - nnz - array containing the number of nonzeros in the various rows 723 (possibly different for each row) or PETSC_NULL 724 725 Output Parameter: 726 . A - the matrix 727 728 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 729 MatXXXXSetPreallocation() paradgm instead of this routine directly. 730 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 731 732 Notes: 733 If nnz is given then nz is ignored 734 735 The AIJ format (also called the Yale sparse matrix format or 736 compressed row storage), is fully compatible with standard Fortran 77 737 storage. That is, the stored row and column indices can begin at 738 either one (as in Fortran) or zero. See the users' manual for details. 739 740 Specify the preallocated storage with either nz or nnz (not both). 741 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 742 allocation. For large problems you MUST preallocate memory or you 743 will get TERRIBLE performance, see the users' manual chapter on matrices. 744 745 By default, this format uses inodes (identical nodes) when possible, to 746 improve numerical efficiency of matrix-vector products and solves. We 747 search for consecutive rows with the same nonzero structure, thereby 748 reusing matrix information to achieve increased efficiency. 749 750 Level: intermediate 751 752 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 753 @*/ 754 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 755 { 756 PetscErrorCode ierr; 757 758 PetscFunctionBegin; 759 ierr = MatCreate(comm,A);CHKERRQ(ierr); 760 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 761 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 762 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 763 PetscFunctionReturn(0); 764 } 765 766 #undef __FUNCT__ 767 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 768 PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 769 { 770 PetscErrorCode ierr; 771 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 772 773 PetscFunctionBegin; 774 if (A->factortype==MAT_FACTOR_NONE) { 775 try { 776 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 777 delete (GPU_Matrix_Ifc *)(cusparseMat->mat); 778 } 779 if (cusparseMat->tempvec!=0) 780 delete cusparseMat->tempvec; 781 delete cusparseMat; 782 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 783 } catch(char* ex) { 784 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 785 } 786 } else { 787 /* The triangular factors */ 788 try { 789 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 790 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 791 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 792 delete (GPU_Matrix_Ifc *)cusparseMatLo; 793 delete (GPU_Matrix_Ifc *)cusparseMatUp; 794 delete (CUSPARRAY*) cusparseTriFactors->tempvec; 795 delete cusparseTriFactors; 796 } catch(char* ex) { 797 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 798 } 799 } 800 if (MAT_cusparseHandle) { 801 cusparseStatus_t stat; 802 stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 803 MAT_cusparseHandle=0; 804 } 805 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 806 A->spptr = 0; 807 808 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 809 PetscFunctionReturn(0); 810 } 811 812 EXTERN_C_BEGIN 813 #undef __FUNCT__ 814 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 815 PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 816 { 817 PetscErrorCode ierr; 818 819 PetscFunctionBegin; 820 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 821 if (B->factortype==MAT_FACTOR_NONE) { 822 /* you cannot check the inode.use flag here since the matrix was just created. 823 now build a GPU matrix data structure */ 824 B->spptr = new Mat_SeqAIJCUSPARSE; 825 ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0; 826 ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0; 827 ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = MAT_CUSPARSE_CSR; 828 } else { 829 /* NEXT, set the pointers to the triangular factors */ 830 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 831 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0; 832 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0; 833 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0; 834 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = cusparseMatSolveStorageFormat; 835 } 836 /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */ 837 if (!MAT_cusparseHandle) { 838 cusparseStatus_t stat; 839 stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 840 } 841 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 842 default cusparse tri solve. Note the difference with the implementation in 843 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 844 ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 845 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 846 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 847 B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 848 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 849 B->ops->mult = MatMult_SeqAIJCUSPARSE; 850 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 851 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 852 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 853 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 854 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 855 ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 856 PetscFunctionReturn(0); 857 } 858 EXTERN_C_END 859 860 /*M 861 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 862 863 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 864 CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 865 the CUSPARSE library. This type is only available when using the 'txpetscgpu' package. 866 Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and 867 the different GPU storage formats. 868 869 Options Database Keys: 870 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 871 . -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 872 . -mat_cusparse_mult_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 873 - -mat_cusparse_solve_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 874 875 Level: beginner 876 877 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 878 M*/ 879