/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

/* String table for the MatCUSPARSEStorageFormat enum; the trailing three
   entries (enum type name, option prefix, NULL terminator) follow the
   convention required by PetscOptionsEnum(). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
cudaStream_t theBodyStream=0;

/* Forward declarations of the SeqAIJCUSPARSE operations defined later in this file */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the name of the solver package ("cusparse") that produced factor matrix A.
   Composed onto factor matrices as "MatFactorGetSolverPackage_C" below. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices on a single GPU of type,
  seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported algorithms are ILU(k) and LU. ICC(k) and
  Cholesky will be supported in future versions. This class does NOT support direct solver operations.

  ./configure --download-txpetscgpu to install PETSc to use CUSPARSE

  Consult CUSPARSE documentation for more information about the matrix storage formats which correspond to the options
  database keys below.

  Options Database Keys:
. -mat_cusparse_solve_storage_format csr - sets the storage format matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Creates a factor matrix for LU/ILU/ILUDT factorization of a SeqAIJ matrix.
   The factor is created by the PETSc CPU path, retyped to SEQAIJCUSPARSE, and the
   CUSPARSE symbolic-factorization entry points are installed on it.  Unsupported
   factor types (e.g. Cholesky/ICC) raise PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format for the SpMV matrix (per-matrix, in A->spptr) and/or for
   triangular-solve factors (via cusparseMatSolveStorageFormat, declared elsewhere —
   presumably in cusparsematimpl.h; it applies to factors built afterwards). */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseMat->format = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}


/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op if A is not of a type that composes the method */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_*_storage_format options database keys.
   Unfactored matrices (MAT_FACTOR_NONE) get the SpMV format option; factor
   matrices get the triangular-solve format option; the combined
   -mat_cusparse_storage_format key sets both (MAT_CUSPARSE_ALL).
   NOTE(review): the return code of PetscObjectOptionsBegin() is not checked
   with CHKERRQ here, unlike every other call in this file — confirm intent. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU factorization: done on the CPU by the SeqAIJ kernel; the numeric
   stage is redirected to the CUSPARSE variant, which uploads the factors. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU factorization: same pattern as the ILU variant above. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/* Extracts the unit lower-triangular factor L from the CPU factored storage
   (Mat_SeqAIJ of factor matrix A), assembles an explicit CSR copy — including
   the implicit 1.0 diagonal entries — in pinned host memory, uploads it to the
   GPU, and runs the CUSPARSE triangular-solve analysis on it.
   Only runs when the GPU copy is absent or stale (valid_GPU_matrix state). */
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc * cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *ai = a->i,*aj = a->j,*vi;
  const MatScalar *aa = a->a,*v;
  PetscErrorCode ierr;
  PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower
         triangular matrix including 1's on the diagonal. */
      /* (n diagonal ones + all strictly-lower entries of rows 1..n-1;
         row 0 of L holds only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers so setMatrix() can upload quickly */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v = aa;
      vi = aj;
      offset = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i] = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset += 1;

        v += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the configured storage format (CSR/ELL/HYB) */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU copy exists */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/* Counterpart of the routine above for the upper-triangular factor U; see the
   body that follows. */
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{

  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc * cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases in the factored storage, so adiag[0]-adiag[n] spans all of U */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers so setMatrix() can upload quickly */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking the rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* reciprocal taken here: the CPU factored storage appears to keep the
           inverted diagonal, while the GPU solve expects the actual value — verify */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the configured storage format (CSR/ELL/HYB) */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU copy exists */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/* Builds both triangular factors on the GPU, allocates the intermediate work
   vector used between the L and U solves, and uploads the row/column
   permutations when the orderings are not identities. */
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS isrow = a->row,iscol = a->icol;
  PetscBool row_identity,col_identity;
  const PetscInt *r,*c;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the L-solve result fed into the U-solve */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
PetscErrorCode
MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *b =(Mat_SeqAIJ*)B->data;
  IS isrow = b->row,iscol = b->col;
  PetscBool row_identity,col_identity;

  PetscFunctionBegin;
  /* numeric factorization is done on the CPU; the factors are then uploaded */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* no permutations required: use the cheaper natural-ordering solves */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Runs the CUSPARSE transpose-solve analysis on both cached triangular factors.
   Note the fill modes are swapped (L analyzed as UPPER, U as LOWER) because the
   factors are used transposed. */
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds the transposed copy of the SpMV matrix on the GPU, used by
   MatMultTranspose/MatMultTransposeAdd.  Called lazily on first use. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  /* fill mode is irrelevant for a GENERAL (non-triangular) matrix */
  stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMat->mat->setMatrix(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b on the GPU with row/column reordering:
   x is permuted in from b, solved through U^T then L^T, and permuted out. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ...
     on the fly */
  /* transpose-solve analysis is performed lazily on first use, then cached */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A^T x = b when the row and column orderings are identities, so no
   permutation of b or x is required. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ...
     on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatUp->solve(bGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b on the GPU with row/column reordering: permute b in, forward
   solve through L, back solve through U, permute x out. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr =
WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}




#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b when no permutation is required: b feeds the L solve directly
   and the U solve writes straight into x. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* (Re)builds the GPU copy of the AIJ matrix used for SpMV.  Any previous GPU
   matrix is deleted, rows with no nonzeros are squeezed out (compressed-row
   layout is forced even when the CPU copy does not use it), and the matrix is
   uploaded in the configured storage format.  No-op unless the GPU copy is
   absent or stale. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that contain at least one nonzero */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).
       */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      /* theBodyStream is the file-scope stream shared across GPU_Matrix_Ifc instances */
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        /* ii/ridx were allocated above only in the non-compressed-row branch */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Creates CUSP-typed work vectors conforming to mat's layouts: right conforms
   to the column map, left to the row map.  Either output may be NULL. */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A x on the GPU using the matrix uploaded by MatSeqAIJCUSPARSECopyToGPU(). */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when the multiply ran on the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T x on the GPU; the transposed copy is built and cached on first use. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr =
VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 739 try { 740 ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 741 } catch (char *ex) { 742 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 743 } 744 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 745 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 746 if (!cusparseMat->mat->hasNonZeroStream()) { 747 ierr = WaitForGPU();CHKERRCUSP(ierr); 748 } 749 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 750 PetscFunctionReturn(0); 751 } 752 753 #undef __FUNCT__ 754 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 755 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 756 { 757 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 758 PetscErrorCode ierr; 759 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 760 CUSPARRAY *xarray,*yarray,*zarray; 761 762 PetscFunctionBegin; 763 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 764 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 765 try { 766 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 767 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 768 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 769 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 770 771 /* multiply add */ 772 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 773 774 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 775 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 776 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 777 778 } catch(char *ex) { 779 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 780 } 781 ierr = WaitForGPU();CHKERRCUSP(ierr); 782 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 783 PetscFunctionReturn(0); 784 } 785 786 #undef __FUNCT__ 787 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 788 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec 
yy,Vec zz) 789 { 790 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 791 PetscErrorCode ierr; 792 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 793 CUSPARRAY *xarray,*yarray,*zarray; 794 795 PetscFunctionBegin; 796 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 797 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 798 if (!cusparseMat->hasTranspose) { 799 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 800 cusparseMat->hasTranspose=PETSC_TRUE; 801 } 802 try { 803 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 804 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 805 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 806 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 807 808 /* multiply add with matrix transpose */ 809 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 810 811 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 812 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 813 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 814 815 } catch(char *ex) { 816 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 817 } 818 ierr = WaitForGPU();CHKERRCUSP(ierr); 819 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 820 PetscFunctionReturn(0); 821 } 822 823 #undef __FUNCT__ 824 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 825 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 826 { 827 PetscErrorCode ierr; 828 829 PetscFunctionBegin; 830 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 831 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 832 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 833 A->ops->mult = MatMult_SeqAIJCUSPARSE; 834 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 835 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 836 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 837 PetscFunctionReturn(0); 838 } 839 840 /* 
--------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
880 881 By default, this format uses inodes (identical nodes) when possible, to 882 improve numerical efficiency of matrix-vector products and solves. We 883 search for consecutive rows with the same nonzero structure, thereby 884 reusing matrix information to achieve increased efficiency. 885 886 Level: intermediate 887 888 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 889 @*/ 890 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 891 { 892 PetscErrorCode ierr; 893 894 PetscFunctionBegin; 895 ierr = MatCreate(comm,A);CHKERRQ(ierr); 896 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 897 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 898 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 899 PetscFunctionReturn(0); 900 } 901 902 #undef __FUNCT__ 903 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 904 PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 905 { 906 PetscErrorCode ierr; 907 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 908 909 PetscFunctionBegin; 910 if (A->factortype==MAT_FACTOR_NONE) { 911 try { 912 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 913 delete (GPU_Matrix_Ifc*)(cusparseMat->mat); 914 } 915 if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec; 916 delete cusparseMat; 917 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 918 } catch(char *ex) { 919 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 920 } 921 } else { 922 /* The triangular factors */ 923 try { 924 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 925 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 926 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 927 delete (GPU_Matrix_Ifc*) cusparseMatLo; 928 delete 
(GPU_Matrix_Ifc*) cusparseMatUp; 929 delete (CUSPARRAY*) cusparseTriFactors->tempvec; 930 delete cusparseTriFactors; 931 } catch(char *ex) { 932 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 933 } 934 } 935 if (MAT_cusparseHandle) { 936 cusparseStatus_t stat; 937 stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 938 939 MAT_cusparseHandle=0; 940 } 941 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 942 A->spptr = 0; 943 944 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 945 PetscFunctionReturn(0); 946 } 947 948 #undef __FUNCT__ 949 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 950 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 951 { 952 PetscErrorCode ierr; 953 954 PetscFunctionBegin; 955 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 956 if (B->factortype==MAT_FACTOR_NONE) { 957 /* you cannot check the inode.use flag here since the matrix was just created. 958 now build a GPU matrix data structure */ 959 B->spptr = new Mat_SeqAIJCUSPARSE; 960 961 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 962 ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec = 0; 963 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 964 ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE; 965 } else { 966 /* NEXT, set the pointers to the triangular factors */ 967 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 968 969 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 970 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 971 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec = 0; 972 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; 973 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose = PETSC_FALSE; 974 } 975 /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) 
*/
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
   the different GPU storage formats.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/