1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format. 4 */ 5 6 #include "petscconf.h" 7 PETSC_CUDA_EXTERN_C_BEGIN 8 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 9 //#include "petscbt.h" 10 #include "../src/vec/vec/impls/dvecimpl.h" 11 #include "petsc-private/vecimpl.h" 12 PETSC_CUDA_EXTERN_C_END 13 #undef VecType 14 #include "cusparsematimpl.h" 15 const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 /* this is such a hack ... but I don't know of another way to pass this variable 18 from one GPU_Matrix_Ifc class to another. This is necessary for the parallel 19 SpMV. Essentially, I need to use the same stream variable in two different 20 data structures. I do this by creating a single instance of that stream 21 and reuse it. */ 22 cudaStream_t theBodyStream=0; 23 24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *); 27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 30 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat); 31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 35 36 #undef __FUNCT__ 37 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 38 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 39 { 40 PetscFunctionBegin; 41 *type = MATSOLVERCUSPARSE; 42 PetscFunctionReturn(0); 43 } 44 45 EXTERN_C_BEGIN 46 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 47 EXTERN_C_END 48 /* 49 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 50 on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 51 52 Level: beginner 53 */ 54 55 EXTERN_C_BEGIN 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 58 PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 59 { 60 PetscErrorCode ierr; 61 62 PetscFunctionBegin; 63 ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 64 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 65 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 66 ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 67 ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 68 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 69 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 70 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 71 (*B)->factortype = ftype; 72 PetscFunctionReturn(0); 73 } 74 EXTERN_C_END 75 76 EXTERN_C_BEGIN 77 #undef __FUNCT__ 78 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 79 PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 80 { 81 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 82 PetscFunctionBegin; 83 switch (op) { 84 case MAT_CUSPARSE_MULT: 85 cusparseMat->format = format; 86 break; 87 case MAT_CUSPARSE_SOLVE: 88 cusparseMatSolveStorageFormat = format; 89 break; 90 case MAT_CUSPARSE_ALL: 91 cusparseMat->format = format; 92 cusparseMatSolveStorageFormat = format; 93 break; 94 default: 95 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 96 } 97 PetscFunctionReturn(0); 98 } 99 EXTERN_C_END 100 101 102 /*@ 103 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 104 operation. Only the MatMult operation can use different GPU storage formats 105 for AIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu 106 to build/install PETSc to use this package. 107 108 Not Collective 109 110 Input Parameters: 111 + A : Matrix of type SEQAIJCUSPARSE 112 . op : MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 113 - format : MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB) 114 115 Output Parameter: 116 117 Level: intermediate 118 119 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEARSEFormatOperation 120 @*/ 121 #undef __FUNCT__ 122 #define __FUNCT__ "MatCUSPARSESetFormat" 123 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 124 { 125 PetscErrorCode ierr; 126 PetscFunctionBegin; 127 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 128 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 129 PetscFunctionReturn(0); 130 } 131 132 #undef __FUNCT__ 133 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 134 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 135 { 136 PetscErrorCode ierr; 137 MatCUSPARSEStorageFormat format; 138 PetscBool flg; 139 PetscFunctionBegin; 140 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 141 ierr = PetscObjectOptionsBegin((PetscObject)A); 142 if (A->factortype==MAT_FACTOR_NONE) { 143 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 144 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 145 if (flg) { 146 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 147 } 148 } 149 else { 150 ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve", 151 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 152 if (flg) { 153 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr); 154 } 155 } 156 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 157 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 158 if (flg) { 159 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 160 } 161 ierr = PetscOptionsEnd();CHKERRQ(ierr); 162 PetscFunctionReturn(0); 163 164 } 165 166 #undef __FUNCT__ 167 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 168 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 169 { 170 PetscErrorCode ierr; 171 172 PetscFunctionBegin; 173 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 174 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 180 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 181 { 182 PetscErrorCode ierr; 183 184 PetscFunctionBegin; 185 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 186 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 187 PetscFunctionReturn(0); 188 } 189 190 #undef __FUNCT__ 191 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix" 192 PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A) 193 { 194 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 195 PetscInt n = A->rmap->n; 196 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 197 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 198 cusparseStatus_t stat; 199 const PetscInt *ai = a->i,*aj = a->j,*vi; 200 const MatScalar *aa = a->a,*v; 201 PetscErrorCode ierr; 202 PetscInt *AiLo, *AjLo; 203 PetscScalar *AALo; 204 PetscInt i,nz, nzLower, offset, rowOffset; 205 206 PetscFunctionBegin; 207 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 208 try { 209 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 210 nzLower=n+ai[n]-ai[1]; 211 212 /* Allocate Space for the lower triangular matrix */ 213 ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 214 ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 215 ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 216 217 /* Fill the lower triangular matrix */ 218 AiLo[0]=(PetscInt) 0; 219 AiLo[n]=nzLower; 220 AjLo[0]=(PetscInt) 0; 221 AALo[0]=(MatScalar) 1.0; 222 v = aa; 223 vi = aj; 224 offset=1; 225 rowOffset=1; 226 for (i=1; i<n; i++) { 227 nz = ai[i+1] - ai[i]; 228 /* additional 1 for the term on the diagonal */ 229 AiLo[i]=rowOffset; 230 rowOffset+=nz+1; 231 232 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 233 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 234 235 offset+=nz; 236 AjLo[offset]=(PetscInt) i; 237 AALo[offset]=(MatScalar) 1.0; 238 offset+=1; 239 240 v += nz; 241 vi += nz; 242 } 243 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 244 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 245 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 246 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 247 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 248 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 249 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 250 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 251 } catch(char* ex) { 252 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 253 } 254 } 255 PetscFunctionReturn(0); 256 } 257 258 #undef __FUNCT__ 259 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix" 260 PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A) 261 { 262 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 263 PetscInt n = A->rmap->n; 264 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 265 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 266 cusparseStatus_t stat; 267 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 268 const MatScalar *aa = a->a,*v; 269 PetscInt *AiUp, *AjUp; 270 PetscScalar *AAUp; 271 PetscInt i,nz, nzUpper, offset; 272 PetscErrorCode ierr; 273 274 PetscFunctionBegin; 275 276 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 277 try { 278 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 279 nzUpper = adiag[0]-adiag[n]; 280 281 /* Allocate Space for the upper triangular matrix */ 282 ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 283 ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 284 ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 285 286 /* Fill the upper triangular matrix */ 287 AiUp[0]=(PetscInt) 0; 288 AiUp[n]=nzUpper; 289 offset = nzUpper; 290 for (i=n-1; i>=0; i--) { 291 v = aa + adiag[i+1] + 1; 292 vi = aj + adiag[i+1] + 1; 293 294 /* number of elements NOT on the diagonal */ 295 nz = adiag[i] - adiag[i+1]-1; 296 297 /* decrement the offset */ 298 offset -= (nz+1); 299 300 /* first, set the diagonal elements */ 301 AjUp[offset] = (PetscInt) i; 302 AAUp[offset] = 1./v[nz]; 303 AiUp[i] = AiUp[i+1] - (nz+1); 304 305 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 306 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 307 } 308 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 309 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 310 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 311 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 312 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 313 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 314 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 315 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 316 } catch(char* ex) { 317 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 318 } 319 } 320 PetscFunctionReturn(0); 321 } 322 323 #undef __FUNCT__ 324 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU" 325 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A) 326 { 327 PetscErrorCode ierr; 328 Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 329 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 330 IS isrow = a->row,iscol = a->icol; 331 PetscBool row_identity,col_identity; 332 const PetscInt *r,*c; 333 PetscInt n = A->rmap->n; 334 335 PetscFunctionBegin; 336 ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 337 ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 338 cusparseTriFactors->tempvec = new CUSPARRAY; 339 cusparseTriFactors->tempvec->resize(n); 340 341 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 342 /*lower triangular indices */ 343 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 344 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 345 if (!row_identity) 346 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 347 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 348 349 /*upper triangular indices */ 350 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 351 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 352 if (!col_identity) 353 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 354 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 355 PetscFunctionReturn(0); 356 } 357 358 #undef __FUNCT__ 359 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 360 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 361 { 362 PetscErrorCode ierr; 363 Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 364 IS isrow = b->row,iscol = b->col; 365 PetscBool row_identity,col_identity; 366 367 PetscFunctionBegin; 368 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 369 /* determine which version of MatSolve needs to be used. */ 370 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 371 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 372 if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 373 else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 374 375 /* get the triangular factors */ 376 ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 384 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 385 { 386 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 387 PetscErrorCode ierr; 388 CUSPARRAY *xGPU, *bGPU; 389 cusparseStatus_t stat; 390 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 391 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 392 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 393 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 394 395 PetscFunctionBegin; 396 /* Get the GPU pointers */ 397 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 398 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 399 400 /* solve with reordering */ 401 ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 402 stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 403 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 404 ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 405 406 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 407 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 408 ierr = WaitForGPU();CHKERRCUSP(ierr); 409 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 410 PetscFunctionReturn(0); 411 } 412 413 414 415 416 #undef __FUNCT__ 417 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 418 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 419 { 420 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 421 PetscErrorCode ierr; 422 CUSPARRAY *xGPU, *bGPU; 423 cusparseStatus_t stat; 424 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 425 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 426 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 427 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 428 429 PetscFunctionBegin; 430 /* Get the GPU pointers */ 431 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 432 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 433 434 /* solve */ 435 stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 436 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 437 438 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 439 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 440 ierr = WaitForGPU();CHKERRCUSP(ierr); 441 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 442 PetscFunctionReturn(0); 443 } 444 445 #undef __FUNCT__ 446 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 447 PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 448 { 449 450 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 451 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 452 PetscInt m = A->rmap->n,*ii,*ridx; 453 PetscErrorCode ierr; 454 455 456 PetscFunctionBegin; 457 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 458 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 459 /* 460 It may be possible to reuse nonzero structure with new matrix values but 461 for simplicity and insured correctness we delete and build a new matrix on 462 the GPU. Likely a very small performance hit. 463 */ 464 if (cusparseMat->mat) { 465 try { 466 delete cusparseMat->mat; 467 if (cusparseMat->tempvec) 468 delete cusparseMat->tempvec; 469 470 } catch(char* ex) { 471 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 472 } 473 } 474 try { 475 cusparseMat->nonzerorow=0; 476 for (int j = 0; j<m; j++) 477 cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 478 479 if (a->compressedrow.use) { 480 m = a->compressedrow.nrows; 481 ii = a->compressedrow.i; 482 ridx = a->compressedrow.rindex; 483 } else { 484 /* Forcing compressed row on the GPU ... only relevant for CSR storage */ 485 int k=0; 486 ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 487 ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 488 ii[0]=0; 489 for (int j = 0; j<m; j++) { 490 if ((a->i[j+1]-a->i[j])>0) { 491 ii[k] = a->i[j]; 492 ridx[k]= j; 493 k++; 494 } 495 } 496 ii[cusparseMat->nonzerorow] = a->nz; 497 m = cusparseMat->nonzerorow; 498 } 499 500 /* Build our matrix ... first determine the GPU storage type */ 501 cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]); 502 503 /* Create the streams and events (if desired). */ 504 PetscMPIInt size; 505 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 506 ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 507 508 /* FILL MODE UPPER is irrelevant */ 509 cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 510 511 /* lastly, build the matrix */ 512 ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 513 cusparseMat->mat->setCPRowIndices(ridx, m); 514 if (!a->compressedrow.use) { 515 ierr = PetscFree(ii);CHKERRQ(ierr); 516 ierr = PetscFree(ridx);CHKERRQ(ierr); 517 } 518 cusparseMat->tempvec = new CUSPARRAY; 519 cusparseMat->tempvec->resize(m); 520 } catch(char* ex) { 521 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 522 } 523 ierr = WaitForGPU();CHKERRCUSP(ierr); 524 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 525 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 526 } 527 PetscFunctionReturn(0); 528 } 529 530 #undef __FUNCT__ 531 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 532 PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 533 { 534 PetscErrorCode ierr; 535 536 PetscFunctionBegin; 537 538 if (right) { 539 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 540 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 541 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 542 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 543 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 544 } 545 if (left) { 546 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 547 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 548 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 549 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 550 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 551 } 552 PetscFunctionReturn(0); 553 } 554 555 #undef __FUNCT__ 556 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 557 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 558 { 559 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 560 PetscErrorCode ierr; 561 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 562 CUSPARRAY *xarray,*yarray; 563 564 PetscFunctionBegin; 565 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 566 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 567 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 568 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 569 try { 570 ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 571 } catch (char* ex) { 572 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 573 } 574 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 575 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 576 if (!cusparseMat->mat->hasNonZeroStream()) { 577 ierr = WaitForGPU();CHKERRCUSP(ierr); 578 } 579 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 583 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 586 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscErrorCode ierr; 590 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 591 CUSPARRAY *xarray,*yarray; 592 593 PetscFunctionBegin; 594 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 595 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 596 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 597 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 598 try { 599 #if !defined(PETSC_USE_COMPLEX) 600 ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 601 #else 602 ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 603 #endif 604 } catch (char* ex) { 605 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 606 } 607 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 608 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 609 if (!cusparseMat->mat->hasNonZeroStream()) { 610 ierr = WaitForGPU();CHKERRCUSP(ierr); 611 } 612 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 613 PetscFunctionReturn(0); 614 } 615 616 #undef __FUNCT__ 617 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 618 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 619 { 620 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 621 PetscErrorCode ierr; 622 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 623 CUSPARRAY *xarray,*yarray,*zarray; 624 PetscFunctionBegin; 625 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 626 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 627 try { 628 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 629 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 630 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 631 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 632 633 /* multiply add */ 634 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 635 636 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 637 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 638 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 639 640 } catch(char* ex) { 641 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 642 } 643 ierr = WaitForGPU();CHKERRCUSP(ierr); 644 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 645 PetscFunctionReturn(0); 646 } 647 648 #undef __FUNCT__ 649 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 650 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 651 { 652 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 653 PetscErrorCode ierr; 654 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 655 CUSPARRAY *xarray,*yarray,*zarray; 656 PetscFunctionBegin; 657 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 658 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 659 try { 660 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 661 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 662 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 663 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 664 665 /* multiply add with matrix transpose */ 666 #if !defined(PETSC_USE_COMPLEX) 667 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 668 #else 669 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 670 #endif 671 672 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 673 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 674 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 675 676 } catch(char* ex) { 677 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 678 } 679 ierr = WaitForGPU();CHKERRCUSP(ierr); 680 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 681 PetscFunctionReturn(0); 682 } 683 684 #undef __FUNCT__ 685 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 686 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 687 { 688 PetscErrorCode ierr; 689 PetscFunctionBegin; 690 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 691 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 692 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 693 A->ops->mult = MatMult_SeqAIJCUSPARSE; 694 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 695 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 696 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 697 PetscFunctionReturn(0); 698 } 699 700 /* --------------------------------------------------------------------------------*/ 701 #undef __FUNCT__ 702 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 703 /*@ 704 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 705 (the default parallel PETSc format). This matrix will ultimately pushed down 706 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 707 assembly performance the user should preallocate the matrix storage by setting 708 the parameter nz (or the array nnz). By setting these parameters accurately, 709 performance during matrix assembly can be increased by more than a factor of 50. 710 711 Collective on MPI_Comm 712 713 Input Parameters: 714 + comm - MPI communicator, set to PETSC_COMM_SELF 715 . m - number of rows 716 . n - number of columns 717 . nz - number of nonzeros per row (same for all rows) 718 - nnz - array containing the number of nonzeros in the various rows 719 (possibly different for each row) or PETSC_NULL 720 721 Output Parameter: 722 . A - the matrix 723 724 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 725 MatXXXXSetPreallocation() paradgm instead of this routine directly. 726 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 727 728 Notes: 729 If nnz is given then nz is ignored 730 731 The AIJ format (also called the Yale sparse matrix format or 732 compressed row storage), is fully compatible with standard Fortran 77 733 storage. That is, the stored row and column indices can begin at 734 either one (as in Fortran) or zero. See the users' manual for details. 735 736 Specify the preallocated storage with either nz or nnz (not both). 737 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 738 allocation. For large problems you MUST preallocate memory or you 739 will get TERRIBLE performance, see the users' manual chapter on matrices. 740 741 By default, this format uses inodes (identical nodes) when possible, to 742 improve numerical efficiency of matrix-vector products and solves. We 743 search for consecutive rows with the same nonzero structure, thereby 744 reusing matrix information to achieve increased efficiency. 745 746 Level: intermediate 747 748 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 749 @*/ 750 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 751 { 752 PetscErrorCode ierr; 753 754 PetscFunctionBegin; 755 ierr = MatCreate(comm,A);CHKERRQ(ierr); 756 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 757 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 758 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 759 PetscFunctionReturn(0); 760 } 761 762 #undef __FUNCT__ 763 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 764 PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 765 { 766 PetscErrorCode ierr; 767 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 768 769 PetscFunctionBegin; 770 if (A->factortype==MAT_FACTOR_NONE) { 771 try { 772 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 773 delete (GPU_Matrix_Ifc *)(cusparseMat->mat); 774 } 775 if (cusparseMat->tempvec!=0) 776 delete cusparseMat->tempvec; 777 delete cusparseMat; 778 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 779 } catch(char* ex) { 780 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 781 } 782 } else { 783 /* The triangular factors */ 784 try { 785 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 786 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 787 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 788 delete (GPU_Matrix_Ifc *)cusparseMatLo; 789 delete (GPU_Matrix_Ifc *)cusparseMatUp; 790 delete (CUSPARRAY*) cusparseTriFactors->tempvec; 791 delete cusparseTriFactors; 792 } catch(char* ex) { 793 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 794 } 795 } 796 if (MAT_cusparseHandle) { 797 cusparseStatus_t stat; 798 stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 799 MAT_cusparseHandle=0; 800 } 801 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 802 A->spptr = 0; 803 804 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 805 PetscFunctionReturn(0); 806 } 807 808 EXTERN_C_BEGIN 809 #undef __FUNCT__ 810 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 811 PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 812 { 813 PetscErrorCode ierr; 814 815 PetscFunctionBegin; 816 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 817 if (B->factortype==MAT_FACTOR_NONE) { 818 /* you cannot check the inode.use flag here since the matrix was just created. 819 now build a GPU matrix data structure */ 820 B->spptr = new Mat_SeqAIJCUSPARSE; 821 ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0; 822 ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0; 823 ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = MAT_CUSPARSE_CSR; 824 } else { 825 /* NEXT, set the pointers to the triangular factors */ 826 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 827 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0; 828 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0; 829 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0; 830 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = cusparseMatSolveStorageFormat; 831 } 832 /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */ 833 if (!MAT_cusparseHandle) { 834 cusparseStatus_t stat; 835 stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 836 } 837 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 838 default cusparse tri solve. Note the difference with the implementation in 839 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 840 ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 841 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 842 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 843 B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 844 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 845 B->ops->mult = MatMult_SeqAIJCUSPARSE; 846 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 847 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 848 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 849 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 850 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 851 ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 852 PetscFunctionReturn(0); 853 } 854 EXTERN_C_END 855 856 /*M 857 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 858 859 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 860 CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 861 the CUSPARSE library. This type is only available when using the 'txpetscgpu' package. 862 Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and 863 the different GPU storage formats. 864 865 Options Database Keys: 866 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 867 . -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 868 . -mat_cusparse_mult_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 869 - -mat_cusparse_solve_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 870 871 Level: beginner 872 873 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 874 M*/ 875