/*
   Defines the basic matrix operations for the AIJ (compressed row)
   matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

// this is such a hack ... but I don't know of another way to pass this variable
// from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
// SpMV. Essentially, I need to use the same stream variable in two different
// data structures. I do this by creating a single instance of that stream
// and reuse it.
cudaStream_t theBodyStream=0;

/* Forward declarations of the SeqAIJCUSPARSE operations implemented later in
   this file; they are installed into the Mat ops table at creation/assembly. */
EXTERN_C_BEGIN
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the solver package that produced this factored matrix.
   Composed onto factor matrices as "MatFactorGetSolverPackage_C". */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END

/* Defined in the CPU SeqAIJ implementation; used below to create the factor shell. */
EXTERN_C_BEGIN
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END 51 52 53 54 /*MC 55 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 56 on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 57 5~ 58 ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves 59 60 Options Database Keys: 61 + -mat_mult_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB 62 + -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB 63 64 Level: beginner 65 M*/ 66 67 EXTERN_C_BEGIN 68 #undef __FUNCT__ 69 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 70 PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 71 { 72 PetscErrorCode ierr; 73 74 PetscFunctionBegin; 75 ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 76 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){ 77 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 78 ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 79 ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 80 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 81 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 82 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 83 (*B)->factortype = ftype; 84 PetscFunctionReturn(0); 85 } 86 EXTERN_C_END 87 88 #undef __FUNCT__ 89 #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve" 90 PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format) 91 { 92 PetscErrorCode ierr; 93 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 94 GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 95 GPU_Matrix_Ifc* cusparseMatUp = 
(GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 96 PetscFunctionBegin; 97 cusparseTriFactors->format = format; 98 if (cusparseMatLo && cusparseMatUp) { 99 // If data exists already delete 100 if (cusparseMatLo) 101 delete (GPU_Matrix_Ifc *)cusparseMatLo; 102 if (cusparseMatUp) 103 delete (GPU_Matrix_Ifc *)cusparseMatUp; 104 105 // key data was deleted so reset the flag. 106 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 107 108 // rebuild matrix 109 ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr); 110 } 111 PetscFunctionReturn(0); 112 } 113 114 //EXTERN_C_BEGIN 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE" 117 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 118 { 119 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 120 PetscErrorCode ierr; 121 PetscFunctionBegin; 122 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 123 switch (op) { 124 case MAT_DIAGBLOCK_CSR: 125 case MAT_OFFDIAGBLOCK_CSR: 126 case MAT_CSR: 127 cusparseMat->format = CSR; 128 break; 129 case MAT_DIAGBLOCK_DIA: 130 case MAT_OFFDIAGBLOCK_DIA: 131 case MAT_DIA: 132 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type."); 133 case MAT_DIAGBLOCK_ELL: 134 case MAT_OFFDIAGBLOCK_ELL: 135 case MAT_ELL: 136 cusparseMat->format = ELL; 137 break; 138 case MAT_DIAGBLOCK_HYB: 139 case MAT_OFFDIAGBLOCK_HYB: 140 case MAT_HYB: 141 cusparseMat->format = HYB; 142 break; 143 default: 144 break; 145 } 146 PetscFunctionReturn(0); 147 } 148 //EXTERN_C_END 149 150 //EXTERN_C_BEGIN 151 #undef __FUNCT__ 152 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 153 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 154 { 155 PetscErrorCode ierr; 156 PetscInt idx=0; 157 char * formats[] ={CSR,ELL,HYB}; 158 MatOption format; 159 PetscBool flg; 160 PetscFunctionBegin; 161 ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, 
AIJCUSPARSE Options","Mat");CHKERRQ(ierr); 162 if (A->factortype==MAT_FACTOR_NONE) { 163 ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format", 164 "Set the storage format of (seq)aijcusparse gpu matrices for SpMV", 165 "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr); 166 switch (idx) 167 { 168 case 0: 169 format=MAT_CSR; 170 break; 171 case 2: 172 format=MAT_ELL; 173 break; 174 case 3: 175 format=MAT_HYB; 176 break; 177 } 178 ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr); 179 } 180 else { 181 ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format", 182 "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve", 183 "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr); 184 ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr); 185 } 186 ierr = PetscOptionsEnd();CHKERRQ(ierr); 187 PetscFunctionReturn(0); 188 189 } 190 //EXTERN_C_END 191 192 #undef __FUNCT__ 193 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 194 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 195 { 196 PetscErrorCode ierr; 197 198 PetscFunctionBegin; 199 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 200 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 201 PetscFunctionReturn(0); 202 } 203 204 #undef __FUNCT__ 205 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 206 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 207 { 208 PetscErrorCode ierr; 209 210 PetscFunctionBegin; 211 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 212 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 213 PetscFunctionReturn(0); 214 } 215 216 #undef __FUNCT__ 217 #define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix" 218 PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A) 219 { 220 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 221 PetscInt n = A->rmap->n; 222 
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 223 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 224 cusparseStatus_t stat; 225 const PetscInt *ai = a->i,*aj = a->j,*vi; 226 const MatScalar *aa = a->a,*v; 227 PetscErrorCode ierr; 228 PetscInt *AiLo, *AjLo; 229 PetscScalar *AALo; 230 PetscInt i,nz, nzLower, offset, rowOffset; 231 232 PetscFunctionBegin; 233 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 234 try { 235 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 236 nzLower=n+ai[n]-ai[1]; 237 238 /* Allocate Space for the lower triangular matrix */ 239 ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 240 ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 241 ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 242 243 /* Fill the lower triangular matrix */ 244 AiLo[0]=(PetscInt) 0; 245 AiLo[n]=nzLower; 246 AjLo[0]=(PetscInt) 0; 247 AALo[0]=(MatScalar) 1.0; 248 v = aa; 249 vi = aj; 250 offset=1; 251 rowOffset=1; 252 for (i=1; i<n; i++) { 253 nz = ai[i+1] - ai[i]; 254 // additional 1 for the term on the diagonal 255 AiLo[i]=rowOffset; 256 rowOffset+=nz+1; 257 258 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 259 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 260 261 offset+=nz; 262 AjLo[offset]=(PetscInt) i; 263 AALo[offset]=(MatScalar) 1.0; 264 offset+=1; 265 266 v += nz; 267 vi += nz; 268 } 269 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 270 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 271 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 272 stat = 
cusparseMat->solveAnalysis();CHKERRCUSP(stat); 273 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 274 // free temporary data 275 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 276 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 277 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 278 } catch(char* ex) { 279 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 280 } 281 } 282 PetscFunctionReturn(0); 283 } 284 285 #undef __FUNCT__ 286 #define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix" 287 PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A) 288 { 289 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 290 PetscInt n = A->rmap->n; 291 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 292 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 293 cusparseStatus_t stat; 294 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 295 const MatScalar *aa = a->a,*v; 296 PetscInt *AiUp, *AjUp; 297 PetscScalar *AAUp; 298 PetscInt i,nz, nzUpper, offset; 299 PetscErrorCode ierr; 300 301 PetscFunctionBegin; 302 303 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 304 try { 305 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 306 nzUpper = adiag[0]-adiag[n]; 307 308 /* Allocate Space for the upper triangular matrix */ 309 ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 310 ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 311 ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 312 313 /* Fill the upper triangular matrix */ 314 AiUp[0]=(PetscInt) 0; 315 AiUp[n]=nzUpper; 316 offset = nzUpper; 317 for (i=n-1; i>=0; i--){ 318 v = aa + adiag[i+1] + 1; 319 vi = aj + adiag[i+1] + 1; 320 321 // number of elements NOT on the diagonal 322 nz = adiag[i] - adiag[i+1]-1; 323 324 // decrement the offset 325 offset -= (nz+1); 326 327 // first, set the diagonal elements 328 AjUp[offset] = (PetscInt) i; 329 AAUp[offset] = 1./v[nz]; 330 AiUp[i] = AiUp[i+1] - (nz+1); 331 332 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 333 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 334 } 335 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 336 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 337 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 338 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 339 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 340 // free temporary data 341 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 342 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 343 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 344 } catch(char* ex) { 345 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 346 } 347 } 348 PetscFunctionReturn(0); 349 } 350 351 #undef __FUNCT__ 352 #define __FUNCT__ "MatCUSPARSEAnalysiAndCopyToGPU" 353 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 357 
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 358 IS isrow = a->row,iscol = a->icol; 359 PetscBool row_identity,col_identity; 360 const PetscInt *r,*c; 361 PetscInt n = A->rmap->n; 362 363 PetscFunctionBegin; 364 ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 365 ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 366 cusparseTriFactors->tempvec = new CUSPARRAY; 367 cusparseTriFactors->tempvec->resize(n); 368 369 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 370 //lower triangular indices 371 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 372 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 373 if (!row_identity) 374 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 375 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 376 377 //upper triangular indices 378 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 379 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 380 if (!col_identity) 381 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 382 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 388 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 389 { 390 PetscErrorCode ierr; 391 Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 392 IS isrow = b->row,iscol = b->col; 393 PetscBool row_identity,col_identity; 394 395 PetscFunctionBegin; 396 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 397 // determine which version of MatSolve needs to be used. 
398 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 399 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 400 if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 401 else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 402 403 //if (!row_identity) printf("Row permutations exist!"); 404 //if (!col_identity) printf("Col permutations exist!"); 405 406 // get the triangular factors 407 ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 408 PetscFunctionReturn(0); 409 } 410 411 412 413 #undef __FUNCT__ 414 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 415 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 416 { 417 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 418 PetscErrorCode ierr; 419 CUSPARRAY *xGPU, *bGPU; 420 cusparseStatus_t stat; 421 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 422 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 423 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 424 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 425 426 PetscFunctionBegin; 427 // Get the GPU pointers 428 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 429 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 430 431 // solve with reordering 432 ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 433 stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 434 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 435 ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 436 437 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 438 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 439 ierr = WaitForGPU();CHKERRCUSP(ierr); 440 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 441 PetscFunctionReturn(0); 442 } 443 444 445 446 447 #undef __FUNCT__ 448 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 449 PetscErrorCode 
MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 450 { 451 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 452 PetscErrorCode ierr; 453 CUSPARRAY *xGPU, *bGPU; 454 cusparseStatus_t stat; 455 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 456 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 457 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 458 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 459 460 PetscFunctionBegin; 461 // Get the GPU pointers 462 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 463 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 464 465 // solve 466 stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 467 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 468 469 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 470 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 471 ierr = WaitForGPU();CHKERRCUSP(ierr); 472 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 473 PetscFunctionReturn(0); 474 } 475 476 #undef __FUNCT__ 477 #define __FUNCT__ "MatCUSPARSECopyToGPU" 478 PetscErrorCode MatCUSPARSECopyToGPU(Mat A) 479 { 480 481 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 482 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 483 PetscInt m = A->rmap->n,*ii,*ridx; 484 PetscErrorCode ierr; 485 486 487 PetscFunctionBegin; 488 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 489 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 490 /* 491 It may be possible to reuse nonzero structure with new matrix values but 492 for simplicity and insured correctness we delete and build a new matrix on 493 the GPU. Likely a very small performance hit. 
494 */ 495 if (cusparseMat->mat){ 496 try { 497 delete cusparseMat->mat; 498 if (cusparseMat->tempvec) 499 delete cusparseMat->tempvec; 500 501 } catch(char* ex) { 502 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 503 } 504 } 505 try { 506 cusparseMat->nonzerorow=0; 507 for (int j = 0; j<m; j++) 508 cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 509 510 if (a->compressedrow.use) { 511 m = a->compressedrow.nrows; 512 ii = a->compressedrow.i; 513 ridx = a->compressedrow.rindex; 514 } else { 515 // Forcing compressed row on the GPU ... only relevant for CSR storage 516 int k=0; 517 ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 518 ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 519 ii[0]=0; 520 for (int j = 0; j<m; j++) { 521 if ((a->i[j+1]-a->i[j])>0) { 522 ii[k] = a->i[j]; 523 ridx[k]= j; 524 k++; 525 } 526 } 527 ii[cusparseMat->nonzerorow] = a->nz; 528 m = cusparseMat->nonzerorow; 529 } 530 531 // Build our matrix ... first determine the GPU storage type 532 cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format); 533 534 // Create the streams and events (if desired). 
535 PetscMPIInt size; 536 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 537 ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 538 539 // FILL MODE UPPER is irrelevant 540 cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 541 542 // lastly, build the matrix 543 ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 544 cusparseMat->mat->setCPRowIndices(ridx, m); 545 // } 546 if (!a->compressedrow.use) { 547 // free data 548 ierr = PetscFree(ii);CHKERRQ(ierr); 549 ierr = PetscFree(ridx);CHKERRQ(ierr); 550 } 551 cusparseMat->tempvec = new CUSPARRAY; 552 cusparseMat->tempvec->resize(m); 553 //A->spptr = cusparseMat; 554 } catch(char* ex) { 555 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 556 } 557 ierr = WaitForGPU();CHKERRCUSP(ierr); 558 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 559 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 560 } 561 PetscFunctionReturn(0); 562 } 563 564 // #undef __FUNCT__ 565 // #define __FUNCT__ "MatCUSPARSECopyFromGPU" 566 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu) 567 // { 568 // Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr; 569 // Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data; 570 // PetscInt m = A->rmap->n; 571 // PetscErrorCode ierr; 572 573 // PetscFunctionBegin; 574 // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 575 // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 576 // try { 577 // cuspstruct->mat = Agpu; 578 // if (a->compressedrow.use) { 579 // //PetscInt *ii, *ridx; 580 // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices"); 581 // } else { 582 // PetscInt i; 583 584 // if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be 
%d", cuspstruct->mat->row_offsets.size()-1, m);} 585 // a->nz = cuspstruct->mat->values.size(); 586 // a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 587 // A->preallocated = PETSC_TRUE; 588 // // Copy ai, aj, aa 589 // if (a->singlemalloc) { 590 // if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 591 // } else { 592 // if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 593 // if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 594 // if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 595 // } 596 // ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 597 // ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 598 // a->singlemalloc = PETSC_TRUE; 599 // thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i); 600 // thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j); 601 // thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a); 602 // // Setup row lengths 603 // if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 604 // ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 605 // ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 606 // for(i = 0; i < m; ++i) { 607 // a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; 608 // } 609 // // a->diag? 
610 // } 611 // cuspstruct->tempvec = new CUSPARRAY; 612 // cuspstruct->tempvec->resize(m); 613 // } catch(char *ex) { 614 // SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex); 615 // } 616 // } 617 // // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying 618 // ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 619 // ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 620 // A->valid_GPU_matrix = PETSC_CUSP_BOTH; 621 // } else { 622 // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices"); 623 // } 624 // PetscFunctionReturn(0); 625 // } 626 627 #undef __FUNCT__ 628 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 629 PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 630 { 631 PetscErrorCode ierr; 632 633 PetscFunctionBegin; 634 635 if (right) { 636 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 637 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 638 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 639 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 640 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 641 } 642 if (left) { 643 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 644 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 645 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 646 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 647 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 648 } 649 PetscFunctionReturn(0); 650 } 651 652 #undef __FUNCT__ 653 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 654 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 655 { 656 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 657 PetscErrorCode ierr; 658 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 659 CUSPARRAY *xarray,*yarray; 660 661 PetscFunctionBegin; 662 // The line below should not be necessary as it has been 
moved to MatAssemblyEnd_SeqAIJCUSPARSE 663 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 664 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 665 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 666 try { 667 ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 668 } catch (char* ex) { 669 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 670 } 671 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 672 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 673 if (!cusparseMat->mat->hasNonZeroStream()) { 674 ierr = WaitForGPU();CHKERRCUSP(ierr); 675 } 676 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 677 PetscFunctionReturn(0); 678 } 679 680 681 #undef __FUNCT__ 682 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 683 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 684 { 685 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 686 PetscErrorCode ierr; 687 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 688 CUSPARRAY *xarray,*yarray; 689 690 PetscFunctionBegin; 691 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 692 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 693 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 694 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 695 try { 696 #if !defined(PETSC_USE_COMPLEX) 697 ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 698 #else 699 ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 700 #endif 701 } catch (char* ex) { 702 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 703 } 704 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 705 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 706 if (!cusparseMat->mat->hasNonZeroStream()) { 707 ierr = WaitForGPU();CHKERRCUSP(ierr); 708 } 709 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 710 
PetscFunctionReturn(0); 711 } 712 713 #undef __FUNCT__ 714 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 715 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 716 { 717 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 718 PetscErrorCode ierr; 719 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 720 CUSPARRAY *xarray,*yarray,*zarray; 721 PetscFunctionBegin; 722 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 723 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 724 try { 725 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 726 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 727 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 728 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 729 730 // multiply add 731 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 732 733 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 734 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 735 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 736 737 } catch(char* ex) { 738 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 739 } 740 ierr = WaitForGPU();CHKERRCUSP(ierr); 741 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 742 PetscFunctionReturn(0); 743 } 744 745 #undef __FUNCT__ 746 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 747 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 748 { 749 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 750 PetscErrorCode ierr; 751 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 752 CUSPARRAY *xarray,*yarray,*zarray; 753 PetscFunctionBegin; 754 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 755 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 756 try { 757 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 758 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 759 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 760 ierr = 
VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 761 762 // multiply add with matrix transpose 763 #if !defined(PETSC_USE_COMPLEX) 764 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 765 #else 766 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 767 #endif 768 769 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 770 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 771 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 772 773 } catch(char* ex) { 774 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 775 } 776 ierr = WaitForGPU();CHKERRCUSP(ierr); 777 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 778 PetscFunctionReturn(0); 779 } 780 781 #undef __FUNCT__ 782 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 783 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 784 { 785 PetscErrorCode ierr; 786 PetscFunctionBegin; 787 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 788 ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 789 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 790 // this is not necessary since MatCUSPARSECopyToGPU has been called. 791 //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 792 // A->valid_GPU_matrix = PETSC_CUSP_CPU; 793 //} 794 A->ops->mult = MatMult_SeqAIJCUSPARSE; 795 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 796 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 797 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 798 PetscFunctionReturn(0); 799 } 800 801 /* --------------------------------------------------------------------------------*/ 802 #undef __FUNCT__ 803 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 804 /*@C 805 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 806 (the default parallel PETSc format). For good matrix assembly performance 807 the user should preallocate the matrix storage by setting the parameter nz 808 (or the array nnz). 
By setting these parameters accurately, performance 809 during matrix assembly can be increased by more than a factor of 50. 810 811 Collective on MPI_Comm 812 813 Input Parameters: 814 + comm - MPI communicator, set to PETSC_COMM_SELF 815 . m - number of rows 816 . n - number of columns 817 . nz - number of nonzeros per row (same for all rows) 818 - nnz - array containing the number of nonzeros in the various rows 819 (possibly different for each row) or PETSC_NULL 820 821 Output Parameter: 822 . A - the matrix 823 824 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 825 MatXXXXSetPreallocation() paradgm instead of this routine directly. 826 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 827 828 Notes: 829 If nnz is given then nz is ignored 830 831 The AIJ format (also called the Yale sparse matrix format or 832 compressed row storage), is fully compatible with standard Fortran 77 833 storage. That is, the stored row and column indices can begin at 834 either one (as in Fortran) or zero. See the users' manual for details. 835 836 Specify the preallocated storage with either nz or nnz (not both). 837 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 838 allocation. For large problems you MUST preallocate memory or you 839 will get TERRIBLE performance, see the users' manual chapter on matrices. 840 841 By default, this format uses inodes (identical nodes) when possible, to 842 improve numerical efficiency of matrix-vector products and solves. We 843 search for consecutive rows with the same nonzero structure, thereby 844 reusing matrix information to achieve increased efficiency. 
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()

@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  // Sequential matrix: local and global sizes coincide.
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  // Preallocate via the SeqAIJ implementation directly; the const is cast
  // away because the implementation routine takes a non-const array.
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Releases the GPU-side storage hanging off A->spptr (either the plain
   CUSPARSE matrix or the lower/upper triangular factors, depending on
   A->factortype), tears down the shared cusparse handle, and then delegates
   to the host SeqAIJ destroy.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular (non-factored) matrix: free the GPU matrix and scratch vector.
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
        delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
        delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors produced by LU/ILU.
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy both factors, the work vector, and the container itself
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  // NOTE(review): MAT_cusparseHandle is a single handle shared by every
  // CUSPARSE matrix (see MatCreate_SeqAIJCUSPARSE below), yet it is destroyed
  // here when ANY one matrix is destroyed; if other CUSPARSE matrices are
  // still alive their subsequent library calls would use a dead handle.
  // Confirm whether reference counting (or simply never destroying the
  // handle until PetscFinalize) is needed.
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /* this next line is because MatDestroy tries to PetscFree spptr if it is not
     zero, and PetscFree only works if the memory was allocated with PetscNew or
     PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor: builds on MatCreate_SeqAIJ, attaches the GPU container
   appropriate to A->factortype to B->spptr (fields zeroed; the GPU matrix is
   built lazily at assembly/factorization time), lazily creates the shared
   cusparse handle, and installs the CUSPARSE method table.
*/
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR;  // default GPU storage; changeable via -mat_*_cusparse_storage_format

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  // Nothing has been copied to the device yet.
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END