1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format. 4 */ 5 6 #include "petscconf.h" 7 PETSC_CUDA_EXTERN_C_BEGIN 8 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 PETSC_CUDA_EXTERN_C_END 14 #undef VecType 15 #include "cusparsematimpl.h" 16 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 17 18 /* this is such a hack ... but I don't know of another way to pass this variable 19 from one GPU_Matrix_Ifc class to another. This is necessary for the parallel 20 SpMV. Essentially, I need to use the same stream variable in two different 21 data structures. I do this by creating a single instance of that stream 22 and reuse it. */ 23 cudaStream_t theBodyStream=0; 24 25 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 26 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 27 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 28 29 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 30 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 31 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 32 33 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 34 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 35 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 36 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 37 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 38 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 39 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 40 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on CUSPARSE factor matrices: reports that the
   "cusparse" solver package produced this factored matrix. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);


/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  ./configure --download-txpetscgpu to install PETSc to use CUSPARSE

  Consult CUSPARSE documentation for more information about the matrix storage formats
  which correspond to the options database keys below.

  Options Database Keys:
. -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory for MATSOLVERCUSPARSE factors: delegates the host-side factor
   construction to the native seqaij factory, converts the result to
   MATSEQAIJCUSPARSE, and installs the CUSPARSE symbolic-factorization
   entry points for the requested factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* build the CPU factor object first, then retype it for the GPU */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat(): records the GPU
   storage format used for SpMV (kept per matrix in A->spptr) and/or for the
   triangular solves (kept in the global cusparseMatSolveStorageFormat).
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* both the SpMV format and the (global) solve format */
    cusparseMat->format = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the -mat_cusparse_*_storage_format options database keys.
   Non-factored matrices get the SpMV format; factored matrices get the
   triangular-solve format; -mat_cusparse_storage_format sets both.
   NOTE(review): PetscOptionsHead is invoked before PetscObjectOptionsBegin
   here, which is the reverse of the usual pairing -- confirm against the
   PETSc version this file targets.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Host-side symbolic ILU; then redirect the numeric stage to the CUSPARSE version. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Host-side symbolic LU; then redirect the numeric stage to the CUSPARSE version. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Host-side symbolic ICC; then redirect the numeric stage to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Host-side symbolic Cholesky; then redirect the numeric stage to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Builds the unit-diagonal lower triangular factor L (from the host ILU/LU
   factor stored in A->data) in CSR form in pinned host memory, uploads it to
   the GPU object in A->spptr, and runs the CUSPARSE solve analysis on it.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/ 255 nzLower=n+ai[n]-ai[1]; 256 257 /* Allocate Space for the lower triangular matrix */ 258 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 259 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 260 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 261 262 /* Fill the lower triangular matrix */ 263 AiLo[0] = (PetscInt) 0; 264 AiLo[n] = nzLower; 265 AjLo[0] = (PetscInt) 0; 266 AALo[0] = (MatScalar) 1.0; 267 v = aa; 268 vi = aj; 269 offset = 1; 270 rowOffset= 1; 271 for (i=1; i<n; i++) { 272 nz = ai[i+1] - ai[i]; 273 /* additional 1 for the term on the diagonal */ 274 AiLo[i] = rowOffset; 275 rowOffset += nz+1; 276 277 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 278 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 279 280 offset += nz; 281 AjLo[offset] = (PetscInt) i; 282 AALo[offset] = (MatScalar) 1.0; 283 offset += 1; 284 285 v += nz; 286 vi += nz; 287 } 288 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 289 290 stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 291 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 292 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 293 294 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 295 296 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 297 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 298 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 299 } catch(char *ex) { 300 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 301 } 302 } 303 PetscFunctionReturn(0); 304 } 305 306 #undef __FUNCT__ 307 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 308 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 309 { 310 Mat_SeqAIJ *a 
= (Mat_SeqAIJ*)A->data; 311 PetscInt n = A->rmap->n; 312 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 313 GPU_Matrix_Ifc * cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 314 cusparseStatus_t stat; 315 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 316 const MatScalar *aa = a->a,*v; 317 PetscInt *AiUp, *AjUp; 318 PetscScalar *AAUp; 319 PetscInt i,nz, nzUpper, offset; 320 PetscErrorCode ierr; 321 322 PetscFunctionBegin; 323 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 324 try { 325 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 326 nzUpper = adiag[0]-adiag[n]; 327 328 /* Allocate Space for the upper triangular matrix */ 329 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 330 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 331 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 332 333 /* Fill the upper triangular matrix */ 334 AiUp[0]=(PetscInt) 0; 335 AiUp[n]=nzUpper; 336 offset = nzUpper; 337 for (i=n-1; i>=0; i--) { 338 v = aa + adiag[i+1] + 1; 339 vi = aj + adiag[i+1] + 1; 340 341 /* number of elements NOT on the diagonal */ 342 nz = adiag[i] - adiag[i+1]-1; 343 344 /* decrement the offset */ 345 offset -= (nz+1); 346 347 /* first, set the diagonal elements */ 348 AjUp[offset] = (PetscInt) i; 349 AAUp[offset] = 1./v[nz]; 350 AiUp[i] = AiUp[i+1] - (nz+1); 351 352 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 353 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 354 } 355 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 356 357 stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 358 ierr = 
cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 359 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 360 361 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 362 363 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 364 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 365 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 366 } catch(char *ex) { 367 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 368 } 369 } 370 PetscFunctionReturn(0); 371 } 372 373 #undef __FUNCT__ 374 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 375 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 376 { 377 PetscErrorCode ierr; 378 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 379 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 380 IS isrow = a->row,iscol = a->icol; 381 PetscBool row_identity,col_identity; 382 const PetscInt *r,*c; 383 PetscInt n = A->rmap->n; 384 385 PetscFunctionBegin; 386 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 387 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 388 389 cusparseTriFactors->tempvec = new CUSPARRAY; 390 cusparseTriFactors->tempvec->resize(n); 391 392 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 393 /*lower triangular indices */ 394 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 395 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 396 if (!row_identity) { 397 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 398 } 399 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 400 401 /*upper triangular indices */ 402 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 403 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 404 if (!col_identity) { 405 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 406 } 407 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 408 PetscFunctionReturn(0); 409 } 410 411 #undef __FUNCT__ 412 #define __FUNCT__ 
"MatSeqAIJCUSPARSEBuildICCTriMatrices" 413 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 414 { 415 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 416 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 417 GPU_Matrix_Ifc * cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 418 GPU_Matrix_Ifc * cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 419 cusparseStatus_t stat; 420 PetscErrorCode ierr; 421 PetscInt *AiUp, *AjUp; 422 PetscScalar *AAUp; 423 PetscScalar *AALo; 424 PetscInt nzUpper=a->nz,n=A->rmap->n,i,offset,nz,j; 425 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 426 const PetscInt *ai=b->i,*aj=b->j,*vj; 427 const MatScalar *aa=b->a,*v; 428 429 PetscFunctionBegin; 430 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 431 try { 432 /* Allocate Space for the upper triangular matrix */ 433 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 434 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 435 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 436 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 437 438 /* Fill the upper triangular matrix */ 439 AiUp[0]=(PetscInt) 0; 440 AiUp[n]=nzUpper; 441 offset = 0; 442 for (i=0; i<n; i++) { 443 /* set the pointers */ 444 v = aa + ai[i]; 445 vj = aj + ai[i]; 446 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 447 448 /* first, set the diagonal elements */ 449 AjUp[offset] = (PetscInt) i; 450 AAUp[offset] = 1.0/v[nz]; 451 AiUp[i] = offset; 452 AALo[offset] = 1.0/v[nz]; 453 454 offset+=1; 455 if (nz>0) { 456 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 457 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 458 for (j=offset; j<offset+nz; j++) { 459 AAUp[j] = -AAUp[j]; 460 AALo[j] = AAUp[j]/v[nz]; 461 } 462 
offset+=nz; 463 } 464 } 465 466 /* Build the upper triangular piece */ 467 cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 468 stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 469 ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 470 stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat); 471 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp; 472 473 /* Build the lower triangular piece */ 474 cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 475 stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 476 ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr); 477 stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat); 478 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo; 479 480 /* set this flag ... 
for performance logging */ 481 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE; 482 483 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 484 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 485 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 486 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 487 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 488 } catch(char *ex) { 489 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 490 } 491 } 492 PetscFunctionReturn(0); 493 } 494 495 #undef __FUNCT__ 496 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 497 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 498 { 499 PetscErrorCode ierr; 500 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 501 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 502 IS ip=a->row; 503 const PetscInt *rip; 504 PetscBool perm_identity; 505 PetscInt n = A->rmap->n; 506 507 PetscFunctionBegin; 508 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 509 cusparseTriFactors->tempvec = new CUSPARRAY; 510 cusparseTriFactors->tempvec->resize(n); 511 /*lower triangular indices */ 512 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 513 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 514 if (!perm_identity) { 515 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr); 516 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr); 517 } 518 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 519 PetscFunctionReturn(0); 520 } 521 522 #undef __FUNCT__ 523 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 524 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 525 { 526 PetscErrorCode ierr; 527 Mat_SeqAIJ *b =(Mat_SeqAIJ*)B->data; 528 IS isrow = b->row,iscol = b->col; 529 PetscBool row_identity,col_identity; 530 531 PetscFunctionBegin; 532 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 533 /* determine which version of MatSolve needs to 
be used. */ 534 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 535 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 536 if (row_identity && col_identity) { 537 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 538 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 539 } else { 540 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 541 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 542 } 543 544 /* get the triangular factors */ 545 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 546 PetscFunctionReturn(0); 547 } 548 549 #undef __FUNCT__ 550 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 551 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 552 { 553 PetscErrorCode ierr; 554 Mat_SeqAIJ *b =(Mat_SeqAIJ*)B->data; 555 IS ip=b->row; 556 PetscBool perm_identity; 557 558 PetscFunctionBegin; 559 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 560 561 /* determine which version of MatSolve needs to be used. 
*/ 562 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 563 if (perm_identity) { 564 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 565 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 566 } else { 567 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 568 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 569 } 570 571 /* get the triangular factors */ 572 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 576 #undef __FUNCT__ 577 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 578 PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 579 { 580 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 581 GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 582 GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 583 cusparseStatus_t stat; 584 PetscFunctionBegin; 585 stat = cusparseMatLo->initializeCusparseMatTranspose(MAT_cusparseHandle, 586 CUSPARSE_MATRIX_TYPE_TRIANGULAR, 587 CUSPARSE_FILL_MODE_UPPER, 588 CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 589 stat = cusparseMatLo->solveAnalysisTranspose();CHKERRCUSP(stat); 590 591 stat = cusparseMatUp->initializeCusparseMatTranspose(MAT_cusparseHandle, 592 CUSPARSE_MATRIX_TYPE_TRIANGULAR, 593 CUSPARSE_FILL_MODE_LOWER, 594 CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 595 stat = cusparseMatUp->solveAnalysisTranspose();CHKERRCUSP(stat); 596 PetscFunctionReturn(0); 597 } 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 601 PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 602 { 603 PetscErrorCode ierr; 604 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 605 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 606 cusparseStatus_t stat; 607 PetscFunctionBegin; 608 if (cusparseMat->isSymmOrHerm) { 609 stat = 
cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 610 } else { 611 stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 612 } 613 ierr = cusparseMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);CHKERRCUSP(ierr); 614 PetscFunctionReturn(0); 615 } 616 617 #undef __FUNCT__ 618 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 619 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 620 { 621 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 622 PetscErrorCode ierr; 623 CUSPARRAY *xGPU, *bGPU; 624 cusparseStatus_t stat; 625 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 626 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 627 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 628 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 629 630 PetscFunctionBegin; 631 /* Analyze the matrix ... 
on the fly */ 632 if (!cusparseTriFactors->hasTranspose) { 633 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 634 cusparseTriFactors->hasTranspose=PETSC_TRUE; 635 } 636 637 /* Get the GPU pointers */ 638 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 639 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 640 641 /* solve with reordering */ 642 ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 643 stat = cusparseMatUp->solveTranspose(xGPU, tempGPU);CHKERRCUSP(stat); 644 stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat); 645 ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr); 646 647 /* restore */ 648 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 649 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 650 ierr = WaitForGPU();CHKERRCUSP(ierr); 651 652 if (cusparseTriFactors->isSymmOrHerm) { 653 ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 654 } else { 655 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 656 } 657 PetscFunctionReturn(0); 658 } 659 660 #undef __FUNCT__ 661 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 662 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 663 { 664 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 665 PetscErrorCode ierr; 666 CUSPARRAY *xGPU, *bGPU; 667 cusparseStatus_t stat; 668 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 669 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 670 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 671 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 672 673 PetscFunctionBegin; 674 /* Analyze the matrix ... 
on the fly */ 675 if (!cusparseTriFactors->hasTranspose) { 676 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 677 cusparseTriFactors->hasTranspose=PETSC_TRUE; 678 } 679 680 /* Get the GPU pointers */ 681 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 682 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 683 684 /* solve */ 685 stat = cusparseMatUp->solveTranspose(bGPU, tempGPU);CHKERRCUSP(stat); 686 stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat); 687 688 /* restore */ 689 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 690 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 691 ierr = WaitForGPU();CHKERRCUSP(ierr); 692 if (cusparseTriFactors->isSymmOrHerm) { 693 ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 694 } else { 695 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 696 } 697 PetscFunctionReturn(0); 698 } 699 700 701 #undef __FUNCT__ 702 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 703 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 704 { 705 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 706 PetscErrorCode ierr; 707 CUSPARRAY *xGPU, *bGPU; 708 cusparseStatus_t stat; 709 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 710 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 711 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 712 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 713 714 PetscFunctionBegin; 715 /* Get the GPU pointers */ 716 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 717 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 718 719 /* solve with reordering */ 720 ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 721 stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 722 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 723 ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 724 725 
ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 726 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 727 ierr = WaitForGPU();CHKERRCUSP(ierr); 728 if (cusparseTriFactors->isSymmOrHerm) { 729 ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 730 } else { 731 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 732 } 733 PetscFunctionReturn(0); 734 } 735 736 737 738 739 #undef __FUNCT__ 740 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 741 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 742 { 743 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 744 PetscErrorCode ierr; 745 CUSPARRAY *xGPU, *bGPU; 746 cusparseStatus_t stat; 747 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 748 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 749 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 750 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 751 752 PetscFunctionBegin; 753 /* Get the GPU pointers */ 754 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 755 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 756 757 /* solve */ 758 stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 759 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 760 761 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 762 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 763 ierr = WaitForGPU();CHKERRCUSP(ierr); 764 if (cusparseTriFactors->isSymmOrHerm) { 765 ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 766 } else { 767 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 768 } 769 PetscFunctionReturn(0); 770 } 771 772 #undef __FUNCT__ 773 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 774 PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 775 { 776 777 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 778 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 779 PetscInt m = A->rmap->n,*ii,*ridx; 780 PetscErrorCode ierr; 781 PetscBool symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE; 782 PetscBool symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE; 783 784 PetscFunctionBegin; 785 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 786 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 787 /* 788 It may be possible to reuse nonzero structure with new matrix values but 789 for simplicity and insured correctness we delete and build a new matrix on 790 the GPU. Likely a very small performance hit. 791 */ 792 if (cusparseMat->mat) { 793 try { 794 delete cusparseMat->mat; 795 if (cusparseMat->tempvec) delete cusparseMat->tempvec; 796 797 } catch(char *ex) { 798 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 799 } 800 } 801 try { 802 cusparseMat->nonzerorow=0; 803 for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 804 805 if (a->compressedrow.use) { 806 m = a->compressedrow.nrows; 807 ii = a->compressedrow.i; 808 ridx = a->compressedrow.rindex; 809 } else { 810 /* Forcing compressed row on the GPU ... only relevant for CSR storage */ 811 int k=0; 812 ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 813 ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 814 ii[0]=0; 815 for (int j = 0; j<m; j++) { 816 if ((a->i[j+1]-a->i[j])>0) { 817 ii[k] = a->i[j]; 818 ridx[k]= j; 819 k++; 820 } 821 } 822 ii[cusparseMat->nonzerorow] = a->nz; 823 824 m = cusparseMat->nonzerorow; 825 } 826 827 /* Build our matrix ... first determine the GPU storage type */ 828 cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]); 829 830 /* Create the streams and events (if desired). 
*/ 831 PetscMPIInt size; 832 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 833 ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 834 835 ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest); 836 if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) { 837 /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */ 838 cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 839 cusparseMat->isSymmOrHerm = PETSC_FALSE; 840 } else { 841 ierr = MatIsSymmetric(A,0.0,&symmetryTest); 842 if (symmetryTest) { 843 cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 844 cusparseMat->isSymmOrHerm = PETSC_TRUE; 845 } else { 846 ierr = MatIsHermitian(A,0.0,&hermitianTest); 847 if (hermitianTest) { 848 cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 849 cusparseMat->isSymmOrHerm = PETSC_TRUE; 850 } else { 851 /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */ 852 cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 853 cusparseMat->isSymmOrHerm = PETSC_FALSE; 854 } 855 } 856 } 857 858 /* lastly, build the matrix */ 859 ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 860 cusparseMat->mat->setCPRowIndices(ridx, m); 861 if (!a->compressedrow.use) { 862 ierr = PetscFree(ii);CHKERRQ(ierr); 863 ierr = PetscFree(ridx);CHKERRQ(ierr); 864 } 865 cusparseMat->tempvec = new 
CUSPARRAY; 866 cusparseMat->tempvec->resize(m); 867 } catch(char *ex) { 868 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 869 } 870 ierr = WaitForGPU();CHKERRCUSP(ierr); 871 872 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 873 874 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 875 } 876 PetscFunctionReturn(0); 877 } 878 879 #undef __FUNCT__ 880 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 881 PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 882 { 883 PetscErrorCode ierr; 884 885 PetscFunctionBegin; 886 if (right) { 887 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 888 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 889 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 890 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 891 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 892 } 893 if (left) { 894 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 895 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 896 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 897 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 898 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 899 } 900 PetscFunctionReturn(0); 901 } 902 903 #undef __FUNCT__ 904 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 905 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 906 { 907 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 908 PetscErrorCode ierr; 909 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 910 CUSPARRAY *xarray,*yarray; 911 912 PetscFunctionBegin; 913 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 914 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 915 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 916 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 917 try { 918 ierr = cusparseMat->mat->multiply(xarray, 
yarray);CHKERRCUSP(ierr); 919 } catch (char *ex) { 920 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 921 } 922 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 923 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 924 if (!cusparseMat->mat->hasNonZeroStream()) { 925 ierr = WaitForGPU();CHKERRCUSP(ierr); 926 } 927 if (cusparseMat->isSymmOrHerm) { 928 ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 929 } else { 930 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 931 } 932 PetscFunctionReturn(0); 933 } 934 935 936 #undef __FUNCT__ 937 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 938 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 939 { 940 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 941 PetscErrorCode ierr; 942 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 943 CUSPARRAY *xarray,*yarray; 944 945 PetscFunctionBegin; 946 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 947 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 948 if (!cusparseMat->hasTranspose) { 949 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 950 cusparseMat->hasTranspose=PETSC_TRUE; 951 } 952 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 953 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 954 try { 955 ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr); 956 } catch (char *ex) { 957 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 958 } 959 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 960 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 961 if (!cusparseMat->mat->hasNonZeroStream()) { 962 ierr = WaitForGPU();CHKERRCUSP(ierr); 963 } 964 if (cusparseMat->isSymmOrHerm) { 965 ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 966 } else { 967 ierr = PetscLogFlops(2.0*a->nz - 
cusparseMat->nonzerorow);CHKERRQ(ierr); 968 } 969 PetscFunctionReturn(0); 970 } 971 972 #undef __FUNCT__ 973 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 974 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 975 { 976 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 977 PetscErrorCode ierr; 978 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 979 CUSPARRAY *xarray,*yarray,*zarray; 980 981 PetscFunctionBegin; 982 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 983 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 984 try { 985 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 986 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 987 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 988 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 989 990 /* multiply add */ 991 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 992 993 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 994 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 995 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 996 997 } catch(char *ex) { 998 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 999 } 1000 ierr = WaitForGPU();CHKERRCUSP(ierr); 1001 if (cusparseMat->isSymmOrHerm) { 1002 ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 1003 } else { 1004 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1005 } 1006 PetscFunctionReturn(0); 1007 } 1008 1009 #undef __FUNCT__ 1010 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 1011 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1012 { 1013 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1014 PetscErrorCode ierr; 1015 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 1016 CUSPARRAY *xarray,*yarray,*zarray; 1017 1018 PetscFunctionBegin; 1019 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1020 ierr = 
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1021 if (!cusparseMat->hasTranspose) { 1022 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1023 cusparseMat->hasTranspose=PETSC_TRUE; 1024 } 1025 try { 1026 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1027 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1028 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1029 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1030 1031 /* multiply add with matrix transpose */ 1032 ierr = cusparseMat->mat->multiplyAddTranspose(xarray, yarray);CHKERRCUSP(ierr); 1033 1034 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1035 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1036 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1037 1038 } catch(char *ex) { 1039 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1040 } 1041 ierr = WaitForGPU();CHKERRCUSP(ierr); 1042 if (cusparseMat->isSymmOrHerm) { 1043 ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 1044 } else { 1045 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1046 } 1047 PetscFunctionReturn(0); 1048 } 1049 1050 #undef __FUNCT__ 1051 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1052 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1053 { 1054 PetscErrorCode ierr; 1055 1056 PetscFunctionBegin; 1057 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1058 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1059 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1060 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1061 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1062 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1063 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1064 PetscFunctionReturn(0); 1065 } 1066 1067 /* --------------------------------------------------------------------------------*/ 1068 #undef __FUNCT__ 1069 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1070 /*@ 1071 
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves.
   We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const from the caller's nnz array to match the preallocation signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Frees the GPU-side data hanging off A->spptr
   (either the Mat_SeqAIJCUSPARSE multiply structure or, for a factored matrix,
   the triangular-factor structure), tears down the shared cusparse handle, and
   then delegates to the base SeqAIJ destroy.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* an unfactored matrix: spptr holds the Mat_SeqAIJCUSPARSE multiply structure */
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors: spptr holds the Mat_SeqAIJCUSPARSETriFactors structure */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): this destroys the file-global cusparse handle on every matrix
     destroy, even if other aijcusparse matrices are still alive — presumably
     those recreate it lazily in MatCreate_SeqAIJCUSPARSE; confirm. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor: builds on MatCreate_SeqAIJ, then
   attaches the GPU-side structure to B->spptr (a multiply structure for an
   unfactored matrix, or a triangular-factors structure for a factor matrix),
   lazily creates the shared cusparse handle, and installs the CUSPARSE
   operation table.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;

    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;

    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm   = PETSC_FALSE;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use a different CUSPARSE library and
   the different GPU storage formats.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/