1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format. 4 */ 5 6 #include "petscconf.h" 7 PETSC_CUDA_EXTERN_C_BEGIN 8 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 9 //#include "petscbt.h" 10 #include "../src/vec/vec/impls/dvecimpl.h" 11 #include "petsc-private/vecimpl.h" 12 PETSC_CUDA_EXTERN_C_END 13 #undef VecType 14 #include "cusparsematimpl.h" 15 16 // this is such a hack ... but I don't know of another way to pass this variable 17 // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel 18 // SpMV. Essentially, I need to use the same stream variable in two different 19 // data structures. I do this by creating a single instance of that stream 20 // and reuse it. 21 cudaStream_t theBodyStream=0; 22 23 EXTERN_C_BEGIN 24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *); 27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 30 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat); 31 EXTERN_C_END 32 33 EXTERN_C_BEGIN 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 36 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 37 { 38 PetscFunctionBegin; 39 *type = MATSOLVERCUSPARSE; 40 PetscFunctionReturn(0); 41 } 42 EXTERN_C_END 43 44 EXTERN_C_BEGIN 45 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 46 EXTERN_C_END 47 48 49 50 /*MC 51 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 52 on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 53 54 ./configure --download-txpetscgpu to install 
PETSc to use CUSPARSE solves 55 56 Options Database Keys: 57 + -mat_mult_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB 58 + -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB 59 60 Level: beginner 61 M*/ 62 63 EXTERN_C_BEGIN 64 #undef __FUNCT__ 65 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 66 PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 67 { 68 PetscErrorCode ierr; 69 70 PetscFunctionBegin; 71 ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 72 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){ 73 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 74 ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 75 ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 76 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 77 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 78 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 79 (*B)->factortype = ftype; 80 PetscFunctionReturn(0); 81 } 82 EXTERN_C_END 83 84 #undef __FUNCT__ 85 #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve" 86 PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format) 87 { 88 PetscErrorCode ierr; 89 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 90 GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 91 GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 92 PetscFunctionBegin; 93 cusparseTriFactors->format = format; 94 if (cusparseMatLo && cusparseMatUp) { 95 // If data exists already delete 96 if (cusparseMatLo) 97 delete (GPU_Matrix_Ifc *)cusparseMatLo; 98 if (cusparseMatUp) 99 delete (GPU_Matrix_Ifc 
*)cusparseMatUp; 100 101 // key data was deleted so reset the flag. 102 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 103 104 // rebuild matrix 105 ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr); 106 } 107 PetscFunctionReturn(0); 108 } 109 110 //EXTERN_C_BEGIN 111 #undef __FUNCT__ 112 #define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE" 113 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 114 { 115 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 116 PetscErrorCode ierr; 117 PetscFunctionBegin; 118 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 119 switch (op) { 120 case DIAGBLOCK_MAT_CSR: 121 case OFFDIAGBLOCK_MAT_CSR: 122 case MAT_CSR: 123 //std::cout << "MatSetOption_SeqAIJCUSPARSE : CSR" << std::endl; 124 cusparseMat->format = CSR; 125 break; 126 case DIAGBLOCK_MAT_DIA: 127 case OFFDIAGBLOCK_MAT_DIA: 128 case MAT_DIA: 129 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type."); 130 case DIAGBLOCK_MAT_ELL: 131 case OFFDIAGBLOCK_MAT_ELL: 132 case MAT_ELL: 133 //std::cout << "MatSetOption_SeqAIJCUSPARSE : ELL" << std::endl; 134 cusparseMat->format = ELL; 135 break; 136 case DIAGBLOCK_MAT_HYB: 137 case OFFDIAGBLOCK_MAT_HYB: 138 case MAT_HYB: 139 //std::cout << "MatSetOption_SeqAIJCUSPARSE : HYB" << std::endl; 140 cusparseMat->format = HYB; 141 break; 142 default: 143 break; 144 } 145 PetscFunctionReturn(0); 146 } 147 //EXTERN_C_END 148 149 //EXTERN_C_BEGIN 150 #undef __FUNCT__ 151 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 152 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 153 { 154 PetscErrorCode ierr; 155 PetscInt idx=0; 156 char * formats[] ={CSR,ELL,HYB}; 157 MatOption format; 158 PetscBool flg; 159 PetscFunctionBegin; 160 ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr); 161 if (A->factortype==MAT_FACTOR_NONE) { 162 ierr = 
PetscOptionsEList("-mat_mult_cusparse_storage_format", 163 "Set the storage format of (seq)aijcusparse gpu matrices for SpMV", 164 "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr); 165 if (formats[idx] == CSR) 166 format=MAT_CSR; 167 else if (formats[idx] == ELL) 168 format=MAT_ELL; 169 else if (formats[idx] == HYB) 170 format=MAT_HYB; 171 172 ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr); 173 } 174 else { 175 ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format", 176 "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve", 177 "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr); 178 ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr); 179 } 180 ierr = PetscOptionsEnd();CHKERRQ(ierr); 181 PetscFunctionReturn(0); 182 183 } 184 //EXTERN_C_END 185 186 #undef __FUNCT__ 187 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 188 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 189 { 190 PetscErrorCode ierr; 191 192 PetscFunctionBegin; 193 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 194 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 200 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 201 { 202 PetscErrorCode ierr; 203 204 PetscFunctionBegin; 205 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 206 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 207 PetscFunctionReturn(0); 208 } 209 210 #undef __FUNCT__ 211 #define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix" 212 PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A) 213 { 214 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 215 PetscInt n = A->rmap->n; 216 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 
217 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 218 cusparseStatus_t stat; 219 const PetscInt *ai = a->i,*aj = a->j,*vi; 220 const MatScalar *aa = a->a,*v; 221 PetscErrorCode ierr; 222 PetscInt *AiLo, *AjLo; 223 PetscScalar *AALo; 224 PetscInt i,nz, nzLower, offset, rowOffset; 225 226 PetscFunctionBegin; 227 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 228 try { 229 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 230 nzLower=n+ai[n]-ai[1]; 231 232 /* Allocate Space for the lower triangular matrix */ 233 ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 234 ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 235 ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 236 237 /* Fill the lower triangular matrix */ 238 AiLo[0]=(PetscInt) 0; 239 AiLo[n]=nzLower; 240 AjLo[0]=(PetscInt) 0; 241 AALo[0]=(MatScalar) 1.0; 242 v = aa; 243 vi = aj; 244 offset=1; 245 rowOffset=1; 246 for (i=1; i<n; i++) { 247 nz = ai[i+1] - ai[i]; 248 // additional 1 for the term on the diagonal 249 AiLo[i]=rowOffset; 250 rowOffset+=nz+1; 251 252 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 253 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 254 255 offset+=nz; 256 AjLo[offset]=(PetscInt) i; 257 AALo[offset]=(MatScalar) 1.0; 258 offset+=1; 259 260 v += nz; 261 vi += nz; 262 } 263 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 264 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 265 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 266 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 267 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = 
cusparseMat; 268 // free temporary data 269 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 270 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 271 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 272 } catch(char* ex) { 273 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 274 } 275 } 276 PetscFunctionReturn(0); 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix" 281 PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A) 282 { 283 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 284 PetscInt n = A->rmap->n; 285 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 286 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 287 cusparseStatus_t stat; 288 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 289 const MatScalar *aa = a->a,*v; 290 PetscInt *AiUp, *AjUp; 291 PetscScalar *AAUp; 292 PetscInt i,nz, nzUpper, offset; 293 PetscErrorCode ierr; 294 295 PetscFunctionBegin; 296 297 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 298 try { 299 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 300 nzUpper = adiag[0]-adiag[n]; 301 302 /* Allocate Space for the upper triangular matrix */ 303 ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 304 ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 305 ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 306 307 /* Fill the upper triangular matrix */ 308 AiUp[0]=(PetscInt) 0; 309 AiUp[n]=nzUpper; 310 offset = nzUpper; 311 for (i=n-1; i>=0; i--){ 312 v = aa + adiag[i+1] + 1; 313 vi = aj + adiag[i+1] + 1; 314 315 // number of elements NOT on the diagonal 316 nz = adiag[i] - adiag[i+1]-1; 317 318 // decrement the offset 319 offset -= (nz+1); 320 321 // first, set the diagonal elements 322 AjUp[offset] = (PetscInt) i; 323 AAUp[offset] = 1./v[nz]; 324 AiUp[i] = AiUp[i+1] - (nz+1); 325 326 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 327 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 328 } 329 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 330 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 331 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 332 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 333 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 334 // free temporary data 335 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 336 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 337 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 338 } catch(char* ex) { 339 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 340 } 341 } 342 PetscFunctionReturn(0); 343 } 344 345 #undef __FUNCT__ 346 #define __FUNCT__ "MatCUSPARSEAnalysiAndCopyToGPU" 347 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A) 348 { 349 PetscErrorCode ierr; 350 Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 351 
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 352 IS isrow = a->row,iscol = a->icol; 353 PetscBool row_identity,col_identity; 354 const PetscInt *r,*c; 355 PetscInt n = A->rmap->n; 356 357 PetscFunctionBegin; 358 ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 359 ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 360 cusparseTriFactors->tempvec = new CUSPARRAY; 361 cusparseTriFactors->tempvec->resize(n); 362 363 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 364 //lower triangular indices 365 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 366 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 367 if (!row_identity) 368 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 369 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 370 371 //upper triangular indices 372 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 373 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 374 if (!col_identity) 375 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 376 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 #undef __FUNCT__ 381 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 382 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 383 { 384 PetscErrorCode ierr; 385 Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 386 IS isrow = b->row,iscol = b->col; 387 PetscBool row_identity,col_identity; 388 389 PetscFunctionBegin; 390 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 391 // determine which version of MatSolve needs to be used. 
392 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 393 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 394 if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 395 else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 396 //if (!row_identity) printf("Row permutations exist!"); 397 //if (!col_identity) printf("Col permutations exist!"); 398 399 // get the triangular factors 400 ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 401 PetscFunctionReturn(0); 402 } 403 404 405 406 #undef __FUNCT__ 407 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 408 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 409 { 410 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 411 PetscErrorCode ierr; 412 CUSPARRAY *xGPU, *bGPU; 413 cusparseStatus_t stat; 414 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 415 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 416 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 417 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 418 419 PetscFunctionBegin; 420 // Get the GPU pointers 421 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 422 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 423 424 // solve with reordering 425 ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 426 stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 427 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 428 ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 429 430 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 431 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 432 ierr = WaitForGPU();CHKERRCUSP(ierr); 433 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 434 PetscFunctionReturn(0); 435 } 436 437 438 439 440 #undef __FUNCT__ 441 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 442 PetscErrorCode 
MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 443 { 444 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 445 PetscErrorCode ierr; 446 CUSPARRAY *xGPU, *bGPU; 447 cusparseStatus_t stat; 448 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 449 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 450 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 451 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 452 453 PetscFunctionBegin; 454 // Get the GPU pointers 455 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 456 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 457 458 // solve 459 stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 460 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 461 462 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 463 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 464 ierr = WaitForGPU();CHKERRCUSP(ierr); 465 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 466 PetscFunctionReturn(0); 467 } 468 469 #undef __FUNCT__ 470 #define __FUNCT__ "MatCUSPARSECopyToGPU" 471 PetscErrorCode MatCUSPARSECopyToGPU(Mat A) 472 { 473 474 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 475 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 476 PetscInt m = A->rmap->n,*ii,*ridx; 477 PetscErrorCode ierr; 478 479 480 PetscFunctionBegin; 481 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 482 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 483 /* 484 It may be possible to reuse nonzero structure with new matrix values but 485 for simplicity and insured correctness we delete and build a new matrix on 486 the GPU. Likely a very small performance hit. 
487 */ 488 if (cusparseMat->mat){ 489 try { 490 delete cusparseMat->mat; 491 if (cusparseMat->tempvec) 492 delete cusparseMat->tempvec; 493 494 } catch(char* ex) { 495 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 496 } 497 } 498 try { 499 cusparseMat->nonzerorow=0; 500 for (int j = 0; j<m; j++) 501 cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 502 503 if (a->compressedrow.use) { 504 m = a->compressedrow.nrows; 505 ii = a->compressedrow.i; 506 ridx = a->compressedrow.rindex; 507 } else { 508 // Forcing compressed row on the GPU ... only relevant for CSR storage 509 int k=0; 510 ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 511 ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 512 ii[0]=0; 513 for (int j = 0; j<m; j++) { 514 if ((a->i[j+1]-a->i[j])>0) { 515 ii[k] = a->i[j]; 516 ridx[k]= j; 517 k++; 518 } 519 } 520 ii[cusparseMat->nonzerorow] = a->nz; 521 m = cusparseMat->nonzerorow; 522 } 523 524 // Build our matrix ... first determine the GPU storage type 525 cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format); 526 527 // Create the streams and events (if desired). 
528 PetscMPIInt size; 529 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 530 ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 531 532 // FILL MODE UPPER is irrelevant 533 cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 534 535 // lastly, build the matrix 536 ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 537 cusparseMat->mat->setCPRowIndices(ridx, m); 538 // } 539 if (!a->compressedrow.use) { 540 // free data 541 ierr = PetscFree(ii);CHKERRQ(ierr); 542 ierr = PetscFree(ridx);CHKERRQ(ierr); 543 } 544 cusparseMat->tempvec = new CUSPARRAY; 545 cusparseMat->tempvec->resize(m); 546 //A->spptr = cusparseMat; 547 } catch(char* ex) { 548 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 549 } 550 ierr = WaitForGPU();CHKERRCUSP(ierr); 551 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 552 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 553 } 554 PetscFunctionReturn(0); 555 } 556 557 // #undef __FUNCT__ 558 // #define __FUNCT__ "MatCUSPARSECopyFromGPU" 559 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu) 560 // { 561 // Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr; 562 // Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data; 563 // PetscInt m = A->rmap->n; 564 // PetscErrorCode ierr; 565 566 // PetscFunctionBegin; 567 // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 568 // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 569 // try { 570 // cuspstruct->mat = Agpu; 571 // if (a->compressedrow.use) { 572 // //PetscInt *ii, *ridx; 573 // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices"); 574 // } else { 575 // PetscInt i; 576 577 // if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be 
%d", cuspstruct->mat->row_offsets.size()-1, m);} 578 // a->nz = cuspstruct->mat->values.size(); 579 // a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 580 // A->preallocated = PETSC_TRUE; 581 // // Copy ai, aj, aa 582 // if (a->singlemalloc) { 583 // if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 584 // } else { 585 // if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 586 // if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 587 // if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 588 // } 589 // ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 590 // ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 591 // a->singlemalloc = PETSC_TRUE; 592 // thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i); 593 // thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j); 594 // thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a); 595 // // Setup row lengths 596 // if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 597 // ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 598 // ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 599 // for(i = 0; i < m; ++i) { 600 // a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; 601 // } 602 // // a->diag? 
603 // } 604 // cuspstruct->tempvec = new CUSPARRAY; 605 // cuspstruct->tempvec->resize(m); 606 // } catch(char *ex) { 607 // SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex); 608 // } 609 // } 610 // // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying 611 // ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 612 // ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 613 // A->valid_GPU_matrix = PETSC_CUSP_BOTH; 614 // } else { 615 // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices"); 616 // } 617 // PetscFunctionReturn(0); 618 // } 619 620 #undef __FUNCT__ 621 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 622 PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 623 { 624 PetscErrorCode ierr; 625 626 PetscFunctionBegin; 627 628 if (right) { 629 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 630 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 631 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 632 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 633 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 634 } 635 if (left) { 636 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 637 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 638 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 639 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 640 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 641 } 642 PetscFunctionReturn(0); 643 } 644 645 #undef __FUNCT__ 646 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 647 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 648 { 649 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 650 PetscErrorCode ierr; 651 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 652 CUSPARRAY *xarray,*yarray; 653 654 PetscFunctionBegin; 655 // The line below should not be necessary as it has been 
moved to MatAssemblyEnd_SeqAIJCUSPARSE 656 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 657 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 658 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 659 try { 660 ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 661 } catch (char* ex) { 662 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 663 } 664 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 665 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 666 if (!cusparseMat->mat->hasNonZeroStream()) { 667 ierr = WaitForGPU();CHKERRCUSP(ierr); 668 } 669 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 670 PetscFunctionReturn(0); 671 } 672 673 674 #undef __FUNCT__ 675 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 676 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 677 { 678 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 679 PetscErrorCode ierr; 680 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 681 CUSPARRAY *xarray,*yarray; 682 683 PetscFunctionBegin; 684 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 685 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 686 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 687 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 688 try { 689 #if !defined(PETSC_USE_COMPLEX) 690 ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 691 #else 692 ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 693 #endif 694 } catch (char* ex) { 695 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 696 } 697 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 698 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 699 if (!cusparseMat->mat->hasNonZeroStream()) { 700 ierr = WaitForGPU();CHKERRCUSP(ierr); 701 } 702 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 703 
PetscFunctionReturn(0); 704 } 705 706 #undef __FUNCT__ 707 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 708 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 709 { 710 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 711 PetscErrorCode ierr; 712 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 713 CUSPARRAY *xarray,*yarray,*zarray; 714 PetscFunctionBegin; 715 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 716 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 717 try { 718 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 719 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 720 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 721 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 722 723 // multiply add 724 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 725 726 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 727 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 728 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 729 730 } catch(char* ex) { 731 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 732 } 733 ierr = WaitForGPU();CHKERRCUSP(ierr); 734 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 735 PetscFunctionReturn(0); 736 } 737 738 #undef __FUNCT__ 739 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 740 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 741 { 742 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 743 PetscErrorCode ierr; 744 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 745 CUSPARRAY *xarray,*yarray,*zarray; 746 PetscFunctionBegin; 747 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 748 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 749 try { 750 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 751 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 752 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 753 ierr = 
VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 754 755 // multiply add with matrix transpose 756 #if !defined(PETSC_USE_COMPLEX) 757 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 758 #else 759 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 760 #endif 761 762 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 763 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 764 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 765 766 } catch(char* ex) { 767 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 768 } 769 ierr = WaitForGPU();CHKERRCUSP(ierr); 770 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 771 PetscFunctionReturn(0); 772 } 773 774 #undef __FUNCT__ 775 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 776 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 777 { 778 PetscErrorCode ierr; 779 PetscFunctionBegin; 780 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 781 ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 782 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 783 // this is not necessary since MatCUSPARSECopyToGPU has been called. 784 //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 785 // A->valid_GPU_matrix = PETSC_CUSP_CPU; 786 //} 787 PetscFunctionReturn(0); 788 } 789 790 /* --------------------------------------------------------------------------------*/ 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 793 /*@C 794 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 795 (the default parallel PETSc format). For good matrix assembly performance 796 the user should preallocate the matrix storage by setting the parameter nz 797 (or the array nnz). By setting these parameters accurately, performance 798 during matrix assembly can be increased by more than a factor of 50. 
799 800 Collective on MPI_Comm 801 802 Input Parameters: 803 + comm - MPI communicator, set to PETSC_COMM_SELF 804 . m - number of rows 805 . n - number of columns 806 . nz - number of nonzeros per row (same for all rows) 807 - nnz - array containing the number of nonzeros in the various rows 808 (possibly different for each row) or PETSC_NULL 809 810 Output Parameter: 811 . A - the matrix 812 813 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 814 MatXXXXSetPreallocation() paradgm instead of this routine directly. 815 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 816 817 Notes: 818 If nnz is given then nz is ignored 819 820 The AIJ format (also called the Yale sparse matrix format or 821 compressed row storage), is fully compatible with standard Fortran 77 822 storage. That is, the stored row and column indices can begin at 823 either one (as in Fortran) or zero. See the users' manual for details. 824 825 Specify the preallocated storage with either nz or nnz (not both). 826 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 827 allocation. For large problems you MUST preallocate memory or you 828 will get TERRIBLE performance, see the users' manual chapter on matrices. 829 830 By default, this format uses inodes (identical nodes) when possible, to 831 improve numerical efficiency of matrix-vector products and solves. We 832 search for consecutive rows with the same nonzero structure, thereby 833 reusing matrix information to achieve increased efficiency. 
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()

@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* NOTE(review): the _SeqAIJ preallocation routine is called directly rather
     than through the MatSeqAIJSetPreallocation() interface; presumably
     intentional to skip the function-table lookup -- confirm it stays in sync
     with the registered implementation. The cast discards const on nnz. */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Frees the GPU-side data hanging off A->spptr and then delegates to the CPU
   SeqAIJ destroy. A non-factored matrix carries a Mat_SeqAIJCUSPARSE
   container; a factored matrix carries a Mat_SeqAIJCUSPARSETriFactors
   container holding the lower/upper triangular factors. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      /* the GPU copy only exists once something has been copied up */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
        delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
        delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      /* the GPU matrix classes report failure by throwing a C string */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  if (MAT_cusparseHandle) {
    /* NOTE(review): this tears down the file-global cuSPARSE handle whenever
       any single matrix is destroyed; other still-live aijcusparse matrices
       would then hold a stale handle until one is lazily recreated -- confirm
       this lifetime is intended. */
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /* MatDestroy() will PetscFree() spptr if it is nonzero, but this container
     was allocated with C++ new (PetscMalloc/PetscNew do not run constructors),
     so zero the pointer to keep MatDestroy from freeing it with the wrong
     allocator. */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor for MATSEQAIJCUSPARSE: builds the CPU SeqAIJ matrix,
   attaches the GPU container to B->spptr, lazily creates the shared cuSPARSE
   handle, and installs the GPU implementations in the operations table. */
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR; /* default GPU storage format */

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    /* NOTE(review): B->spptr is dereferenced here but not allocated in this
       routine; presumably the factor path allocates the
       Mat_SeqAIJCUSPARSETriFactors container before this constructor runs --
       confirm against MatGetFactor_seqaij_cusparse. */
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve.
  // Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  /* install the CUSPARSE implementations in the operations table; the
     remaining ops fall through to the SeqAIJ defaults set by MatCreate_SeqAIJ */
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  /* advertise the new type name and mark the GPU copy as not yet allocated */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END