/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

/* String names for the GPU storage formats; order must match the MatCUSPARSEStorageFormat enum. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
cudaStream_t theBodyStream = 0;

/* Forward declarations of the SeqAIJCUSPARSE implementation routines defined below. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Report the solver package backing this factored matrix; always MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END

/*
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
  on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp

  Level: beginner
*/

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
  Create a CUSPARSE-capable factor matrix: build the standard PETSc AIJ factor,
  then retype it to MATSEQAIJCUSPARSE so the triangular solves run on the GPU.
  Only LU-family factorizations (LU, ILU, ILUDT) are supported.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices:
   record the requested GPU storage format for SpMV and/or triangular solve. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    /* the solve-side format is a file-scope setting declared in cusparsematimpl.h */
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseMat->format = format;
cusparseMatSolveStorageFormat = format; 97 break; 98 default: 99 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 100 } 101 PetscFunctionReturn(0); 102 } 103 EXTERN_C_END 104 105 106 /*@ 107 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 108 operation. Only the MatMult operation can use different GPU storage formats 109 for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu 110 to build/install PETSc to use this package. 111 112 Not Collective 113 114 Input Parameters: 115 + A - Matrix of type SEQAIJCUSPARSE 116 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 117 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB) 118 119 Output Parameter: 120 121 Level: intermediate 122 123 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 124 @*/ 125 #undef __FUNCT__ 126 #define __FUNCT__ "MatCUSPARSESetFormat" 127 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 128 { 129 PetscErrorCode ierr; 130 131 PetscFunctionBegin; 132 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 133 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 134 PetscFunctionReturn(0); 135 } 136 137 #undef __FUNCT__ 138 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 139 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 140 { 141 PetscErrorCode ierr; 142 MatCUSPARSEStorageFormat format; 143 PetscBool flg; 144 145 PetscFunctionBegin; 146 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 147 ierr = 
PetscObjectOptionsBegin((PetscObject)A); 148 if (A->factortype==MAT_FACTOR_NONE) { 149 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 150 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 151 if (flg) { 152 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 153 } 154 } else { 155 ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve", 156 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 157 if (flg) { 158 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr); 159 } 160 } 161 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 162 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 163 if (flg) { 164 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 165 } 166 ierr = PetscOptionsEnd();CHKERRQ(ierr); 167 PetscFunctionReturn(0); 168 169 } 170 171 #undef __FUNCT__ 172 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 173 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 174 { 175 PetscErrorCode ierr; 176 177 PetscFunctionBegin; 178 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 179 180 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 181 PetscFunctionReturn(0); 182 } 183 184 #undef __FUNCT__ 185 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 186 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 187 { 188 PetscErrorCode ierr; 189 190 PetscFunctionBegin; 191 ierr = 
MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
  Extract the unit lower-triangular factor L from the combined host factor
  (Mat_SeqAIJ layout of a factored matrix) into a separate CSR structure in
  pinned host memory, hand it to a GPU_Matrix_Ifc instance, and run the
  cuSPARSE triangular-solve analysis on it. Only runs when the GPU copy is
  unallocated or stale relative to the CPU.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/ 217 nzLower=n+ai[n]-ai[1]; 218 219 /* Allocate Space for the lower triangular matrix */ 220 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 221 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 222 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 223 224 /* Fill the lower triangular matrix */ 225 AiLo[0] = (PetscInt) 0; 226 AiLo[n] = nzLower; 227 AjLo[0] = (PetscInt) 0; 228 AALo[0] = (MatScalar) 1.0; 229 v = aa; 230 vi = aj; 231 offset = 1; 232 rowOffset= 1; 233 for (i=1; i<n; i++) { 234 nz = ai[i+1] - ai[i]; 235 /* additional 1 for the term on the diagonal */ 236 AiLo[i] = rowOffset; 237 rowOffset += nz+1; 238 239 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 240 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 241 242 offset += nz; 243 AjLo[offset] = (PetscInt) i; 244 AALo[offset] = (MatScalar) 1.0; 245 offset += 1; 246 247 v += nz; 248 vi += nz; 249 } 250 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 251 252 stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 253 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 254 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 255 256 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 257 258 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 259 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 260 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 261 } catch(char *ex) { 262 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 263 } 264 } 265 PetscFunctionReturn(0); 266 } 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix" 270 PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscInt 
n = A->rmap->n; 274 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 275 GPU_Matrix_Ifc * cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 276 cusparseStatus_t stat; 277 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 278 const MatScalar *aa = a->a,*v; 279 PetscInt *AiUp, *AjUp; 280 PetscScalar *AAUp; 281 PetscInt i,nz, nzUpper, offset; 282 PetscErrorCode ierr; 283 284 PetscFunctionBegin; 285 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 286 try { 287 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 288 nzUpper = adiag[0]-adiag[n]; 289 290 /* Allocate Space for the upper triangular matrix */ 291 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 292 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 293 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 294 295 /* Fill the upper triangular matrix */ 296 AiUp[0]=(PetscInt) 0; 297 AiUp[n]=nzUpper; 298 offset = nzUpper; 299 for (i=n-1; i>=0; i--) { 300 v = aa + adiag[i+1] + 1; 301 vi = aj + adiag[i+1] + 1; 302 303 /* number of elements NOT on the diagonal */ 304 nz = adiag[i] - adiag[i+1]-1; 305 306 /* decrement the offset */ 307 offset -= (nz+1); 308 309 /* first, set the diagonal elements */ 310 AjUp[offset] = (PetscInt) i; 311 AAUp[offset] = 1./v[nz]; 312 AiUp[i] = AiUp[i+1] - (nz+1); 313 314 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 315 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 316 } 317 cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 318 319 stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 320 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 321 
stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
  Build both triangular factors on the GPU, allocate the intermediate work
  vector used between the two triangular solves, and record any row/column
  permutations from the ordering so the solve can reorder on the device.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* work vector shared by the forward and backward solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices: only needed when the row ordering is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
  Numeric LU/ILU factorization: run the CPU SeqAIJ factorization, pick the
  natural-ordering fast path for the solves when both orderings are the
  identity, then mirror the triangular factors onto the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b    = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
  Run the cuSPARSE solve analysis for the TRANSPOSED triangular factors.
  Note the fill modes are swapped: L^T is upper triangular, U^T is lower.
  Called lazily the first time a transpose solve is requested.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
  Build the transposed copy of the SpMV matrix on the GPU. Called lazily the
  first time MatMultTranspose/MatMultTransposeAdd is invoked.
*/
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMat->mat->setMatrix(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
  xx = A^{-T} bb with row/column reordering applied on the device.
  For the transpose solve the factor order is reversed: U^T first, then L^T.
*/
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly (first transpose solve only) */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose = PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* xx = A^{-T} bb, natural ordering: no device-side permutation required. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly (first transpose solve only) */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose = PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatUp->solve(bGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* xx = A^{-1} bb with row/column reordering applied on the device: L first, then U. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr =
WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* xx = A^{-1} bb, natural ordering: no device-side permutation required. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: forward substitution with L, then back substitution with U */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
  Copy the host AIJ matrix to the GPU in the requested storage format,
  (re)building the GPU matrix from scratch whenever the CPU copy is newer.
  Rows with no nonzeros are squeezed out (compressed-row form) before upload.
*/
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      cusparseMat->nonzerorow = 0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k = 0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0] = 0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]   = a->i[j];
            ridx[k] = j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired). The shared theBodyStream is
         reused so the parallel SpMV can synchronize with this matrix (see the
         note at the top of the file). */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        /* ii/ridx were allocated locally above; the GPU matrix keeps its own copy */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create compatible CUSP vectors for this matrix (right: domain, left: range). */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr =
PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* yy = A*xx computed entirely on the GPU. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when the multiply ran on the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* yy = A^T*xx; the transposed GPU matrix is generated lazily on first use. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose = PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* zz = yy + A*xx: copy yy into zz, then accumulate A*xx into zz on the GPU. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
/* BUGFIX: __FUNCT__ previously said "MatMultAdd_SeqAIJCUSPARSE", which would
   produce a wrong function name in PETSc error tracebacks. */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* zz = yy + A^T*xx: copy yy into zz, then accumulate A^T*xx into zz on the GPU. */
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose = PETSC_TRUE;
  }
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose.
       BUGFIX: accumulate into the output zarray (zz), not the read-only input
       yarray -- mirrors MatMultAdd_SeqAIJCUSPARSE above, which uses zarray. */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish assembly on the CPU, push the matrix to the GPU, and install the GPU mult ops. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
 --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* standard AIJ creation sequence: create -> sizes -> type -> preallocate */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* calls the SeqAIJ implementation directly (the type is already fixed above) */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Frees the GPU-side data hanging off A->spptr and
   then delegates to MatDestroy_SeqAIJ for the CPU-side storage.

   For an unfactored matrix (MAT_FACTOR_NONE) spptr is a Mat_SeqAIJCUSPARSE
   holding the GPU matrix and a temporary vector; for a factored matrix it is a
   Mat_SeqAIJCUSPARSETriFactors holding the lower/upper triangular factors.
   C++ exceptions thrown by the deletes are converted to PETSc errors.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* only delete the device matrix if it was ever allocated on the GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete
(GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is shared by all aijcusparse matrices but is
     destroyed whenever any one of them is destroyed; MatCreate_SeqAIJCUSPARSE
     recreates it lazily, but matrices still alive at this point would hold a
     destroyed handle -- confirm this is the intended lifetime */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree
     only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Constructor invoked by MatSetType() for the
   MATSEQAIJCUSPARSE type. Builds the CPU SeqAIJ structure first, then attaches
   the GPU container in B->spptr: a plain Mat_SeqAIJCUSPARSE for an unfactored
   matrix, or a Mat_SeqAIJCUSPARSETriFactors for a factored one.
*/
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;

    /* GPU matrix and scratch vector are built lazily (on copy-to-GPU / first use) */
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;

    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* install the GPU implementations of the core matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no device storage exists yet; it is allocated on first copy-to-GPU */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use the different CUSPARSE library
   and GPU storage formats.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with the 'txpetscgpu' package.
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with the 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with the 'txpetscgpu' package.

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/