/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format, implemented on NVIDIA GPUs with the CUSPARSE
  library (triangular solves) via the txpetscgpu GPU_Matrix_Ifc classes.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

/* Enum-string table for MatCUSPARSEStorageFormat; trailing entries follow the
   PetscOptionsEnum convention: type name, prefix, terminating 0. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
cudaStream_t theBodyStream = 0;

/* Forward declarations of the operations implemented in this file */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the solver package name for factored SeqAIJCUSPARSE matrices. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices on a single GPU of type,
  seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported algorithms are ILU(k) and LU. ICC(k) and
  Cholesky will be supported in future versions. This class does NOT support direct solver operations.

  ./configure --download-txpetscgpu to install PETSc to use CUSPARSE

  Consult CUSPARSE documentation for more information about the matrix storage formats which correspond to the options
  database keys below.

  Options Database Keys:
. -mat_cusparse_solve_storage_format csr - sets the storage format matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Creates a factor matrix for LU/ILU/ILUDT factorization of a SeqAIJ matrix,
   retyped as SeqAIJCUSPARSE so the triangular solves run on the GPU. */
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Reuse the CPU AIJ factor shell, then switch its type to the GPU class */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(); records the
   requested GPU storage format for SpMV and/or triangular solves. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    /* solve format is a file-scope setting consumed when factors are built */
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END


/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if one is composed */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes -mat_cusparse_* options. Unfactored matrices read the SpMV format
   option; factored matrices read the triangular-solve format option; both read
   the combined -mat_cusparse_storage_format option. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead() is called before PetscObjectOptionsBegin();
     usually Head appears inside a Begin/End pair -- confirm against this PETSc
     version's options API before reordering. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: defer to the CPU SeqAIJ implementation, then route the numeric
   phase to the CUSPARSE version so factors are copied to the GPU afterwards. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: same pattern as the ILU variant above. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/* Extracts the unit lower-triangular factor L (implicit 1's placed explicitly
   on the diagonal) from the combined CPU LU storage and uploads it to the GPU,
   running the CUSPARSE solve analysis on it. */
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular
         matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix; pinned host memory
         speeds the subsequent host-to-device transfer */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;      /* row 0 holds only the unit diagonal */
      v       = aa;
      vi      = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/* Extracts the upper-triangular factor U (diagonal stored as its reciprocal,
   as MatSolve_SeqAIJ expects) from the combined CPU LU storage, uploads it to
   the GPU, and runs the CUSPARSE solve analysis on it. */
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         (adiag decreases monotonically in this storage scheme) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored inverted in the CPU factor) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/* Builds both triangular factors on the GPU, allocates the intermediate solve
   vector, and records the row/column permutations when they are not identity. */
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the two triangular solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU on the CPU via SeqAIJ, then selects the GPU solve variants
   (natural-ordering fast path when both permutations are identity) and
   uploads the triangular factors. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b     = (Mat_SeqAIJ*)B->data;
  IS             isrow  = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Prepares the transposed solve analysis for both factors. Note the fill
   modes swap: L^T is upper triangular and U^T is lower triangular. */
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds the transposed copy of the matrix on the GPU used by
   MatMultTranspose/MatMultTransposeAdd. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* fill mode is irrelevant for a GENERAL matrix */
  stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMat->mat->setMatrix(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b on the GPU with reordering: U^T solve first, then L^T. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose = PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Natural-ordering fast path of the transposed solve: no permutations. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose = PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatUp->solve(bGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b on the GPU with reordering: L solve first, then U. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}




#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Natural-ordering fast path of the forward solve: no permutations. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copies the (assembled) AIJ matrix to the GPU in the selected storage
   format, always using compressed-row indexing on the device. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      cusparseMat->nonzerorow = 0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k = 0;
        ierr  = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr  = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0] = 0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]   = a->i[j];
            ridx[k] = j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired). */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        /* ii/ridx were temporaries built above; setMatrix/setCPRowIndices
           have taken what they need */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Creates compatible CUSP right/left work vectors sharing the matrix layouts. */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A x on the GPU. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when the multiply ran on the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T x on the GPU; builds the transposed GPU matrix on first use. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose = PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A x on the GPU: copy y into z, then accumulate A x into z. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T x on the GPU: copy y into z, then accumulate A^T x into z.
   (Previously __FUNCT__ was misnamed "MatMultAdd_SeqAIJCUSPARSE".) */
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose = PETSC_TRUE;
  }
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose.
       BUG FIX: accumulate into the output array zarray, not into yarray --
       yarray belongs to the read-only input vector yy (obtained with
       VecCUSPGetArrayRead) and writing there clobbered yy while leaving
       zz holding only the copy of yy. Mirrors MatMultAdd_SeqAIJCUSPARSE. */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finishes assembly on the CPU, mirrors the matrix to the GPU, and installs
   the GPU mult operations (skipped for flush assemblies). */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
--------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - releases the GPU-side data hanging off A->spptr
   (either the SpMV matrix or the triangular factors, depending on factortype),
   tears down the shared cusparse handle, then defers to MatDestroy_SeqAIJ
   for the CPU-side storage.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* only the GPU matrix itself needs the valid_GPU_matrix guard; it is
         allocated lazily on first copy-to-GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if
     the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - type constructor registered for MATSEQAIJCUSPARSE.
   Builds on MatCreate_SeqAIJ, attaches the GPU container appropriate to the
   factor type, lazily creates the library-wide cusparse handle, and installs
   the CUSPARSE ops table.
*/
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;

    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;

    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
   the different GPU storage formats.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/