/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/

#include "petscconf.h"
#include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
#undef VecType
#include "cusparsematimpl.h"

/* Storage-format names in the layout PetscOptionsEnum() expects: the value
   strings, then the enum-type name, the option prefix, and a terminating 0. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
cudaStream_t theBodyStream=0;

/* Forward declarations of the factorization, triangular-solve, multiply, and
   options routines installed on the Mat operations tables below. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports MATSOLVERCUSPARSE as the solver package backing a factored matrix;
   composed on factor matrices under "MatFactorGetSolverPackage_C". */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  ./configure --download-txpetscgpu to install PETSc to use CUSPARSE

  Consult CUSPARSE documentation for more information about the matrix storage formats
  which correspond to the options database keys below.

  Options Database Keys:
. -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
65 66 Level: beginner 67 68 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 69 M*/ 70 71 #undef __FUNCT__ 72 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 73 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 74 { 75 PetscErrorCode ierr; 76 PetscInt n = A->rmap->n; 77 78 PetscFunctionBegin; 79 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 80 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 81 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 82 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 83 84 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 85 ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr); 86 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 87 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 88 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 89 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 90 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 91 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 92 93 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 94 (*B)->factortype = ftype; 95 PetscFunctionReturn(0); 96 } 97 98 #undef __FUNCT__ 99 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 100 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 101 { 102 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 103 104 PetscFunctionBegin; 105 switch (op) { 106 case MAT_CUSPARSE_MULT: 107 cusparseMat->format = format; 108 break; 
109 case MAT_CUSPARSE_SOLVE: 110 cusparseMatSolveStorageFormat = format; 111 break; 112 case MAT_CUSPARSE_ALL: 113 cusparseMat->format = format; 114 cusparseMatSolveStorageFormat = format; 115 break; 116 default: 117 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 118 } 119 PetscFunctionReturn(0); 120 } 121 122 /*@ 123 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 124 operation. Only the MatMult operation can use different GPU storage formats 125 for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu 126 to build/install PETSc to use this package. 127 128 Not Collective 129 130 Input Parameters: 131 + A - Matrix of type SEQAIJCUSPARSE 132 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently a no-op for
     matrix types that do not compose "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_* options that select the GPU storage format:
   the *_mult_* key applies to an unfactored matrix (SpMV), the *_solve_* key
   to a factored matrix (TriSolve), and the plain key to both. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead() is invoked before PetscObjectOptionsBegin()
     (the conventional order is Begin then Head), and there is no matching
     PetscOptionsTail() before PetscOptionsEnd() — confirm against other Mat
     implementations of this PETSc version. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: format only affects MatMult */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: format only affects the triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU ILU symbolic factorization; installs the CUSPARSE numeric stage. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU LU symbolic factorization; installs the CUSPARSE numeric stage. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* CPU ICC symbolic factorization; installs the CUSPARSE numeric stage. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* CPU Cholesky symbolic factorization; installs the CUSPARSE numeric stage. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric =
MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Extracts the strictly-lower part of the host ILU factor (stored in A->data,
   CSR), adds the implicit unit diagonal, and uploads it to the GPU as the
   lower triangular solve matrix, including the cusparse solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiLo, *AjLo;      /* host staging buffers (CSR row ptr / col idx) */
  PetscScalar                  *AALo;             /* host staging buffer (values) */
  PetscInt                     i,nz, nzLower, offset, rowOffset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* only rebuild when the GPU copy is missing or stale relative to the CPU */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1;
         row 0 of L contains only its unit diagonal, hence ai[n]-ai[1]) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the setMatrix() upload can be fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;      /* row 0: unit diagonal only */
      v       = aa;
      vi      = aj;
      offset  = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the requested storage format and run
         the cusparse triangular-solve analysis phase */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* NOTE(review): if any CHKERRCUSP above returns early, these pinned
         buffers leak — consider freeing on the error paths too. */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Extracts the upper-triangular part of the host ILU factor (diagonal stored
   inverted, per MatLUFactorNumeric_SeqAIJ layout) and uploads it to the GPU
   as the upper triangular solve matrix, including the solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;      /* host staging buffers */
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      /* adiag[] runs backwards in this factor layout, so the count is adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];   /* v[nz] holds the stored inverse diagonal */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix and run the cusparse solve analysis */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Uploads both ILU triangular factors to the GPU, allocates the intermediate
   solve vector, and registers the row/column permutations (if non-identity)
   used by the reordering MatSolve variants. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the result of the lower solve before the upper solve */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Builds both ICC triangular factors on the GPU from the host factor. Only the
   upper-triangular pattern is staged; the "lower" factor reuses that pattern
   and is solved transposed (see CUSPARSE_OPERATION_TRANSPOSE below). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;
  PetscInt                     *AiUp, *AjUp;       /* host staging buffers */
  PetscScalar                  *AAUp;
  PetscScalar                  *AALo;
  PetscInt                     nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is accessed both as Mat_SeqAIJ (above) and as
     Mat_SeqSBAIJ here; presumably the ICC factor stores an SBAIJ-layout
     structure in the same slot — confirm against MatICCFactorSymbolic_SeqAIJ. */
  Mat_SeqSBAIJ                 *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt               *ai =
b->i,*aj = b->j,*vj;
  const MatScalar              *aa = b->a,*v;

  PetscFunctionBegin;
  /* only rebuild when the GPU copy is missing or stale relative to the CPU */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];       /* v[nz] is the stored diagonal of row i */
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* copy off-diagonals; the sign flip / diagonal scaling below matches
             the value layout produced by the host ICC factorization */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* Build the upper triangular piece */
      cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp;

      /* Build the lower triangular piece */
      /* stored with the SAME upper-triangular pattern (AiUp/AjUp) but the AALo
         values; the lower solve is performed as a transposed upper solve */
      cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr);
      stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo;

      /* set this flag ... for performance logging */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Uploads the ICC factors to the GPU, allocates the intermediate solve vector,
   and registers the (single, symmetric) permutation on both factors when it is
   not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);
  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
}
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU/ILU factorization: runs the CPU kernel, installs the MatSolve
   variants matching the ordering, then mirrors the factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rperm = fact->row,cperm = fact->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variants skip the reorder-in/reorder-out steps */
  ierr = ISIdentity(rperm,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(cperm,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC factorization: runs the CPU kernel, installs the
   MatSolve variants matching the ordering, then mirrors the factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm  = fact->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used */
  ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Lazily prepares the transposed triangular factors for MatSolveTranspose:
   L^T becomes an upper triangular unit-diagonal matrix and U^T becomes a lower
   triangular non-unit-diagonal matrix, each followed by its solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loTri   = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upTri   = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  stat = loTri->initializeCusparseMatTranspose(MAT_cusparseHandle,CUSPARSE_MATRIX_TYPE_TRIANGULAR,CUSPARSE_FILL_MODE_UPPER,CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
  stat = loTri->solveAnalysisTranspose();CHKERRCUSP(stat);

  stat = upTri->initializeCusparseMatTranspose(MAT_cusparseHandle,CUSPARSE_MATRIX_TYPE_TRIANGULAR,CUSPARSE_FILL_MODE_LOWER,CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  stat = upTri->solveAnalysisTranspose();CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusparseMat->isSymmOrHerm) {
    stat =
cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  } else {
    stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  }
  /* upload the CSR arrays of A as the transpose representation */
  ierr = cusparseMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b on the GPU with reordering: U^T solve then L^T solve,
   bracketed by the permutation in/out steps registered at analysis time. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  /* (the transpose analysis is done once and cached via hasTranspose) */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solveTranspose(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* ICC factors store both triangles through one pattern, hence the higher flop count */
  if (cusparseTriFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A^T x = b on the GPU for natural ordering (no permutation steps). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Analyze the matrix ...
on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatUp->solveTranspose(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (cusparseTriFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b on the GPU with reordering: L solve then U solve, bracketed
   by the permutation in/out steps registered at analysis time. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (cusparseTriFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b on the GPU for natural ordering (no permutation steps). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (cusparseTriFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           =
(Mat_SeqAIJ*)A->data;
  PetscInt           m = A->rmap->n,*ii,*ridx;
  PetscBool          symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE;
  PetscBool          symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
       It may be possible to reuse nonzero structure with new matrix values but
       for simplicity and insured correctness we delete and build a new matrix on
       the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that contain at least one nonzero */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* host matrix already stores a compressed-row map; reuse it */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        /* build row pointers (ii) and original-row indices (ridx) over the
           nonempty rows only; freed below after setMatrix/setCPRowIndices */
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).
         theBodyStream is shared across GPU_Matrix_Ifc instances for parallel SpMV. */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* Pick the CUSPARSE matrix type: GENERAL unless the matrix is known (or
         tested) to be symmetric or Hermitian, in which case CUSPARSE can exploit
         the structure.  The fill mode/diag type are placeholders for GENERAL. */
      ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest);CHKERRQ(ierr);
      if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) {
        /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
        cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
        cusparseMat->isSymmOrHerm = PETSC_FALSE;
      } else {
        ierr = MatIsSymmetric(A,0.0,&symmetryTest);CHKERRQ(ierr);
        if (symmetryTest) {
          cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
          cusparseMat->isSymmOrHerm = PETSC_TRUE;
        } else {
          ierr = MatIsHermitian(A,0.0,&hermitianTest);CHKERRQ(ierr);
          if (hermitianTest) {
            cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
            cusparseMat->isSymmOrHerm = PETSC_TRUE;
          } else {
            /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
            cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
            cusparseMat->isSymmOrHerm = PETSC_FALSE;
          }
        }
      }

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* ii/ridx were allocated above only in the forced-compressed-row branch */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr =
PetscFree(ridx);CHKERRQ(ierr);
    }
    /* GPU scratch vector used by the multiply/solve kernels */
    cusparseMat->tempvec = new CUSPARRAY;
    cusparseMat->tempvec->resize(m);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;

  ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create GPU (VECSEQCUSP) vectors compatible with this matrix: the right
   vector has the column dimension (input of MatMult), the left vector the
   row dimension (output of MatMult). */
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* Fix: the right vector lives in the column space (its layout references
       mat->cmap below), so it must take the column-map block size; the
       original passed mat->rmap->bs, which is wrong whenever the row and
       column block sizes differ. */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x entirely on the GPU via the CUSPARSE SpMV */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr =
VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when the GPU matrix runs on the default stream; with a
     nonzero stream the caller is expected to overlap/synchronize later */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  if (cusparseMat->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T*x on the GPU; the transpose data structure is generated lazily on
   first use and cached via the hasTranspose flag. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  if (cusparseMat->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz -
3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x on the GPU: z is first set to a copy of y, then the CUSPARSE
   multiply-add accumulates A*x into z. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add: accumulates A*x into the copy of y already stored in z */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (cusparseMat->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T*x on the GPU; the transpose data structure is generated lazily
   on first use. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose.
       Bug fix: accumulate A^T*x into zarray (the output vector z, which was
       just set to a copy of y), not into yarray -- the original passed
       yarray, which clobbered the read-only input y and left z without the
       A^T*x contribution, inconsistent with MatMultAdd_SeqAIJCUSPARSE above. */
    ierr = cusparseMat->mat->multiplyAddTranspose(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (cusparseMat->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish assembly on the CPU, mirror the new values to the GPU (unfactored
   matrices only), and install the GPU multiplication kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
--------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Release the GPU-side data (either the SpMV matrix or the triangular
   factors, depending on factortype), then fall through to the host AIJ
   destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete
(GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a single handle shared by all
     aijcusparse matrices (created once in MatCreate_SeqAIJCUSPARSE), yet it is
     destroyed here whenever ANY one matrix is destroyed; other still-live
     matrices would then use a destroyed handle.  Verify this is safe when
     multiple matrices exist simultaneously. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor: builds the host SeqAIJ matrix, attaches the GPU container
   (SpMV data for unfactored matrices, triangular-factor slots for factored
   ones), creates the shared CUSPARSE handle, and installs the GPU method
   table. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm   = PETSC_FALSE;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve.
Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
   the different GPU storage formats.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/