/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format, implemented on the GPU via the CUSPARSE library
  (txpetscgpu GPU_Matrix_Ifc abstraction).
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

// this is such a hack ... but I don't know of another way to pass this variable
// from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
// SpMV. Essentially, I need to use the same stream variable in two different
// data structures. I do this by creating a single instance of that stream
// and reuse it.
cudaStream_t theBodyStream=0;

/* forward declarations of the CUSPARSE implementations installed into Mat ops below */
EXTERN_C_BEGIN
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed onto factor matrices: reports MATSOLVERCUSPARSE as the solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END



/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
  on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp

  ./configure --download-txpetscgpu to install
PETSc to use CUSPARSE solves

  Options Database Keys:
+ -mat_mult_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB

  Level: beginner
M*/

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
  Creates a factor matrix for LU/ILU/ILUDT: builds the base seqaij factor,
  retypes it to MATSEQAIJCUSPARSE, and installs the CUSPARSE symbolic-factor
  ops so the numeric factorization and triangular solves run on the GPU.
  Any other factor type is rejected with PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
  Records the GPU storage format for the triangular-solve factors; if factors
  already exist on the GPU they are deleted and rebuilt in the new format by
  MatCUSPARSEAnalysisAndCopyToGPU().
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    // If data exists already delete (the inner if-checks are redundant with the
    // enclosing condition but kept as written)
    if (cusparseMatLo)
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
    if (cusparseMatUp)
      delete (GPU_Matrix_Ifc
*)cusparseMatUp;

    // key data was deleted so reset the flag.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild matrix (re-runs triangular build + solve analysis in the new format)
    ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
  Applies the generic SeqAIJ option handling, then maps the storage-format
  options onto the GPU matrix format field (CSR/ELL/HYB); DIA is rejected.
  All other options are handled entirely by MatSetOption_SeqAIJ.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_CSR:
    cusparseMat->format = CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_ELL:
    cusparseMat->format = ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
  case MAT_OFFDIAGBLOCK_HYB:
  case MAT_HYB:
    cusparseMat->format = HYB;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
  Processes -mat_mult_cusparse_storage_format (regular matrices) or
  -mat_solve_cusparse_storage_format (factor matrices) and forwards the chosen
  format to MatSetOption_SeqAIJCUSPARSE / MatAIJCUSPARSESetGPUStorageFormatForMatSolve.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt idx=0;
  // NOTE(review): CSR/ELL/HYB here come from the txpetscgpu headers and are used
  // both as option-list strings and (below) compared by pointer equality against
  // formats[idx]; pointer comparison only works because the same macro/pointer is
  // reused -- confirm against cusparsematimpl.h.
  char * formats[] ={CSR,ELL,HYB};
  MatOption format;
  PetscBool flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    // idx defaults to 0 (CSR) when the option is not given, so one branch always fires
    if (formats[idx] == CSR)
format=MAT_CSR;
    else if (formats[idx] == ELL)
      format=MAT_ELL;
    else if (formats[idx] == HYB)
      format=MAT_HYB;

    ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  }
  else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
//EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
  Symbolic ILU: delegates entirely to the CPU SeqAIJ implementation, then
  redirects the numeric factorization to the CUSPARSE version so the factors
  get copied/analyzed on the GPU afterwards.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
  Symbolic LU: same pattern as the ILU variant above -- CPU symbolic phase,
  GPU numeric phase.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
  Extracts the unit lower-triangular factor L (with explicit 1.0 diagonal)
  from the host factor matrix A into pinned host CSR arrays, uploads it into a
  new GPU_Matrix_Ifc, and runs the CUSPARSE triangular-solve analysis.
  Only runs when the GPU copy is unallocated or stale (CPU newer).
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *ai = a->i,*aj = a->j,*vi;
  const MatScalar *aa = a->a,*v;
  PetscErrorCode ierr;
PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      // ai[n]-ai[1] = strictly-lower entries of rows 1..n-1 in the factored
      // storage; +n adds the unit diagonal.
      // NOTE(review): reads ai[1], so this assumes n >= 1 -- confirm callers
      // never hand in an empty factor.
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      // pinned host memory so the later device upload in setMatrix() is fast
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      // row 0 holds only the unit diagonal
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v = aa;
      vi = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        // additional 1 for the term on the diagonal
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        // copy the strictly-lower entries of row i, then append the unit diagonal
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v += nz;
        vi += nz;
      }
      // allocate the GPU matrix in the requested storage format and analyze it
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
  Extracts the upper-triangular factor U from the host factor matrix A into
  pinned host CSR arrays (rows assembled back-to-front via a->diag), uploads
  it into a new GPU_Matrix_Ifc, and runs the CUSPARSE solve analysis.
  The stored diagonal entry is inverted (1./v[nz]) so the GPU solve can
  multiply instead of divide. Only runs when the GPU copy is missing or stale.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      // in the factored SeqAIJ layout a->diag decreases, so adiag[0]-adiag[n]
      // counts all U entries including the diagonal
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned for fast upload) */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      // walk rows from last to first, filling the CSR arrays from the back
      for (i=n-1; i>=0; i--){
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        // number of elements NOT on the diagonal
        nz = adiag[i] - adiag[i+1]-1;

        // decrement the offset
        offset -= (nz+1);

        // first, set the diagonal elements (inverted pivot, see header comment)
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cusparseMat =
GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
/* BUGFIX: __FUNCT__ was misspelled "MatCUSPARSEAnalysiAndCopyToGPU" (missing 's'),
   so error tracebacks would not match the actual function name. */
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"
/*
  Builds both triangular factors on the GPU, allocates the temporary solve
  vector, marks the GPU copy valid, and uploads the row/column permutation
  indices when the orderings are not the identity.
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS isrow = a->row,iscol = a->icol;
  PetscBool row_identity,col_identity;
  const PetscInt *r,*c;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  // scratch vector shared by the two triangular solves in MatSolve
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices: only uploaded when a real row permutation exists
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices: only uploaded when a real column permutation exists
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c,
n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
  Numeric LU/ILU factorization: the factorization itself happens on the CPU
  via MatLUFactorNumeric_SeqAIJ; this wrapper then picks the appropriate GPU
  MatSolve (permuted vs. natural ordering) and copies/analyzes the triangular
  factors on the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data;
  IS isrow = b->row,iscol = b->col;
  PetscBool row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  // determine which version of MatSolve needs to be used.
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else B->ops->solve = MatSolve_SeqAIJCUSPARSE;
  //if (!row_identity) printf("Row permutations exist!");
  //if (!col_identity) printf("Col permutations exist!");

  // get the triangular factors
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
  Triangular solve xx = U \ (L \ bb) on the GPU, with row/column reordering:
  reorderIn permutes bb into xx, then L-solve into the scratch vector, U-solve
  back into xx, and reorderOut undoes the column permutation in place.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve with reordering
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat =
cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  // block until the asynchronous solves have finished before logging/returning
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}




#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
  Triangular solve xx = U \ (L \ bb) on the GPU for the natural-ordering case:
  no permutation steps, bb feeds the L-solve directly.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
  (Re)builds the SpMV matrix on the GPU from the host CSR data whenever the
  GPU copy is unallocated or the CPU copy is newer. Always uses a compressed
  (nonzero-rows-only) row structure; see body for details.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;


PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      // count rows that actually contain nonzeros (used for flop logging and
      // for the forced compressed-row structure below)
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        // host already stores a compressed row structure; reuse it directly
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        // Forcing compressed row on the GPU ... only relevant for CSR storage
        // build ii (row offsets over nonzero rows) and ridx (original row ids)
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
PetscMPIInt size;
      // NOTE(review): queries PETSC_COMM_WORLD rather than the matrix's own
      // communicator; the size only feeds buildStreamsAndEvents (shared
      // theBodyStream for the parallel SpMV) -- confirm this is intentional.
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      //	}
      if (!a->compressedrow.use) {
        // free data: ii/ridx were allocated above only in the forced-compression path
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
      //A->spptr = cusparseMat;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

// #undef __FUNCT__
// #define __FUNCT__ "MatCUSPARSECopyFromGPU"
// PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
// {
//   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
//   Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data;
//   PetscInt m = A->rmap->n;
//   PetscErrorCode ierr;

//   PetscFunctionBegin;
//   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
//     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
//       try {
//         cuspstruct->mat = Agpu;
//         if (a->compressedrow.use) {
//           //PetscInt *ii, *ridx;
//           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
//         } else {
//           PetscInt i;

//           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
//           a->nz = cuspstruct->mat->values.size();
//           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
//           A->preallocated = PETSC_TRUE;
//           // Copy ai, aj, aa
//           if (a->singlemalloc) {
//             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
//           } else {
//             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
//             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
//             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
//           }
//           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
//           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
//           a->singlemalloc = PETSC_TRUE;
//           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
//           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
//           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
//           // Setup row lengths
//           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
//           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
//           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
//           for(i = 0; i < m; ++i) {
//             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
//           }
//           // a->diag?
//         }
//         cuspstruct->tempvec = new CUSPARRAY;
//         cuspstruct->tempvec->resize(m);
//       } catch(char *ex) {
//         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
//       }
//     }
//     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
//     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
//     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
//     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
//   } else {
//     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
//   }
//   PetscFunctionReturn(0);
// }

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
  Creates CUSP-type work vectors compatible with this matrix: the right vector
  sized by the column layout, the left vector by the row layout.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    // NOTE(review): uses rmap->bs for the *right* (column-space) vector; the
    // left branch below uses the same -- confirm cmap->bs was not intended here.
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
  GPU sparse matrix-vector product yy = A*xx via GPU_Matrix_Ifc::multiply.
  Skips the final device sync when the matrix runs on a nonzero stream.
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  // The line below should not be necessary as it has been
// moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  // only synchronize when the work ran on the default stream
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
  GPU transpose product yy = A^T*xx (Hermitian transpose for complex scalars),
  same structure as MatMult_SeqAIJCUSPARSE.
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
  GPU multiply-add zz = yy + A*xx: zz is first overwritten with yy, then the
  product is accumulated into zz's device array via multiplyAdd.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add: zz already holds yy, so accumulate A*xx into zarray
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
/* BUGFIX: __FUNCT__ was left as "MatMultAdd_SeqAIJCUSPARSE" (copy/paste), which
   would misreport the function name in error tracebacks. */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
  GPU transpose multiply-add zz = yy + A^T*xx (Hermitian for complex scalars);
  same copy-then-accumulate structure as MatMultAdd_SeqAIJCUSPARSE.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr =
VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add with matrix transpose; zz already holds a copy of yy.
    // BUGFIX: the product must accumulate into zarray (zz), not yarray -- yy
    // was fetched read-only and zz would otherwise never receive A^T*xx;
    // this mirrors MatMultAdd_SeqAIJCUSPARSE, which accumulates into zarray.
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
  Completes assembly on the CPU, pushes the data to the GPU, and (for final
  assembly) installs the GPU mult/multadd/transpose kernels as the Mat ops.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  // this is not necessary since MatCUSPARSECopyToGPU has been called.
  //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
  //  A->valid_GPU_matrix = PETSC_CUSP_CPU;
  //}
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@C
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameter nz
   (or the array nnz).
By setting these parameters accurately, performance
   during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
835 836 Level: intermediate 837 838 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 839 840 @*/ 841 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 842 { 843 PetscErrorCode ierr; 844 845 PetscFunctionBegin; 846 ierr = MatCreate(comm,A);CHKERRQ(ierr); 847 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 848 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 849 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 850 PetscFunctionReturn(0); 851 } 852 853 #undef __FUNCT__ 854 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 855 PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 856 { 857 PetscErrorCode ierr; 858 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 859 860 PetscFunctionBegin; 861 if (A->factortype==MAT_FACTOR_NONE) { 862 // The regular matrices 863 try { 864 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 865 delete (GPU_Matrix_Ifc *)(cusparseMat->mat); 866 } 867 if (cusparseMat->tempvec!=0) 868 delete cusparseMat->tempvec; 869 delete cusparseMat; 870 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 871 } catch(char* ex) { 872 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 873 } 874 } else { 875 // The triangular factors 876 try { 877 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 878 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 879 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 880 // destroy the matrix and the container 881 delete (GPU_Matrix_Ifc *)cusparseMatLo; 882 delete (GPU_Matrix_Ifc *)cusparseMatUp; 883 delete (CUSPARRAY*) cusparseTriFactors->tempvec; 884 delete cusparseTriFactors; 885 } catch(char* ex) { 886 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 887 } 888 } 889 if (MAT_cusparseHandle) { 890 
cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and
     PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which
     don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE.  Builds the
   host SeqAIJ matrix, attaches the GPU container in B->spptr, ensures the shared
   cusparse library handle exists, and installs the CUSPARSE implementations in
   the function table.
*/
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  GPUStorageFormat format = CSR;   // default device storage format

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat     = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format  = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    // NOTE(review): unlike the branch above, this branch writes through B->spptr
    // without allocating a Mat_SeqAIJCUSPARSETriFactors here -- presumably the
    // factor path allocates it before this runs; verify against the MatGetFactor
    // implementation.
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format         = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  // nothing has been copied to the device yet
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END
