/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

// this is such a hack ... but I don't know of another way to pass this variable
// from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
// SpMV. Essentially, I need to use the same stream variable in two different
// data structures. I do this by creating a single instance of that stream
// and reuse it.
cudaStream_t theBodyStream=0;

// Forward declarations of the SeqAIJCUSPARSE entry points installed on the Mat below.
EXTERN_C_BEGIN
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports MATSOLVERCUSPARSE as the solver package backing this factored matrix. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
// Implemented by the plain SeqAIJ code; used as the starting point in MatGetFactor_seqaij_cusparse().
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END


/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
  on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp

  Use ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves

  Options Database Keys:
+ -mat_mult_cusparse_storage_format <CSR>  - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB

  Level: beginner
M*/

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factor-creation routine registered for the cusparse solver package: builds
   the factor matrix with the plain PETSc AIJ routine, retypes it to
   MATSEQAIJCUSPARSE, and installs the CUSPARSE symbolic-factorization entry
   points.  Only LU/ILU/ILUDT factor types are supported.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format (CSR/ELL/HYB) used for the triangular solves.
   If triangular factors already live on the GPU they are deleted and rebuilt
   in the new format via MatCUSPARSEAnalysisAndCopyToGPU().
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp =
(GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    // If data exists already delete
    if (cusparseMatLo)
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
    if (cusparseMatUp)
      delete (GPU_Matrix_Ifc *)cusparseMatUp;

    // key data was deleted so reset the flag.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild matrix
    ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   Forwards the option to the SeqAIJ implementation and additionally maps the
   GPU-storage-format options onto the format member of the Mat_SeqAIJCUSPARSE
   structure.  DIA storage is rejected for this matrix type.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_CSR:
    cusparseMat->format = CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    // SETERRQ returns from the function, so no break is needed here
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_ELL:
    cusparseMat->format = ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
  case MAT_OFFDIAGBLOCK_HYB:
  case MAT_HYB:
    cusparseMat->format = HYB;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes -mat_mult_cusparse_storage_format (unfactored matrices) or
   -mat_solve_cusparse_storage_format (factored matrices) and applies the
   chosen GPU storage format.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       idx=0;
  char * formats[] ={CSR,ELL,HYB};  /* entries are compared by pointer identity below */
  MatOption format;
  PetscBool flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    // idx indexes formats[], so exactly one of the three branches matches
    if (formats[idx] == CSR)
      format=MAT_CSR;
    else if (formats[idx] == ELL)
      format=MAT_ELL;
    else if (formats[idx] == HYB)
      format=MAT_HYB;

    ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  }
  else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
//EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic ILU factorization; the numeric stage is redirected to the CUSPARSE version. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic LU factorization; the numeric stage is redirected to the CUSPARSE version. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Copies the strictly-lower-triangular part of the host factor, plus a unit
   diagonal, into a GPU triangular matrix and runs the cusparse solve analysis.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *ai = a->i,*aj = a->j,*vi;
  const MatScalar *aa = a->a,*v;
  PetscErrorCode ierr;
  PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  // only (re)build when the GPU copy is missing or stale relative to the CPU
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      // row 0 contributes only its unit diagonal, hence the ai[1] offset
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      // pinned (page-locked) host buffers to speed up the host->device transfer
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v  = aa;
      vi = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        // additional 1 for the term on the diagonal
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        // copy the strictly-lower entries of row i, then append the unit diagonal
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      // hand the assembled CSR data to the GPU triangular-solve object
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Copies the upper-triangular part of the host factor into a GPU triangular
   matrix (diagonal entries stored as 1./(host value)) and runs the cusparse
   solve analysis on it.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/ 301 nzUpper = adiag[0]-adiag[n]; 302 303 /* Allocate Space for the upper triangular matrix */ 304 ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 305 ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 306 ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 307 308 /* Fill the upper triangular matrix */ 309 AiUp[0]=(PetscInt) 0; 310 AiUp[n]=nzUpper; 311 offset = nzUpper; 312 for (i=n-1; i>=0; i--){ 313 v = aa + adiag[i+1] + 1; 314 vi = aj + adiag[i+1] + 1; 315 316 // number of elements NOT on the diagonal 317 nz = adiag[i] - adiag[i+1]-1; 318 319 // decrement the offset 320 offset -= (nz+1); 321 322 // first, set the diagonal elements 323 AjUp[offset] = (PetscInt) i; 324 AAUp[offset] = 1./v[nz]; 325 AiUp[i] = AiUp[i+1] - (nz+1); 326 327 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 328 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 329 } 330 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 331 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 332 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 333 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 334 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 335 // free temporary data 336 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 337 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 338 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 339 } catch(char* ex) { 340 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 341 } 342 } 343 PetscFunctionReturn(0); 344 } 345 346 #undef __FUNCT__ 347 #define __FUNCT__ "MatCUSPARSEAnalysiAndCopyToGPU" 348 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A) 349 { 350 PetscErrorCode ierr; 351 Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 352 
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS isrow = a->row,iscol = a->icol;
  PetscBool row_identity,col_identity;
  const PetscInt *r,*c;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  // scratch vector holding the intermediate result between the L and U solves
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    // only upload the ordering when it actually permutes something
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: performed on the CPU by the SeqAIJ kernel,
   after which the factors are copied to the GPU and the appropriate MatSolve
   variant is selected based on the orderings.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data;
  IS isrow = b->row,iscol = b->col;
  PetscBool row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  // determine which version of MatSolve needs to be used.
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  // natural ordering skips the reorder kernels around the triangular solves
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  //if (!row_identity) printf("Row permutations exist!");
  //if (!col_identity) printf("Col permutations exist!");

  // get the triangular factors
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve xx = U^{-1} L^{-1} bb for factors with a non-identity
   ordering: bb is permuted into xx, the L and U solves run through the scratch
   vector, and xx is permuted back to the original ordering.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve with reordering
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}




#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU triangular solve for identity row/column orderings (no permutation kernels needed). */
PetscErrorCode
MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve: L-solve into the scratch vector, then U-solve into xx
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   (Re)builds the GPU copy of an unfactored SeqAIJCUSPARSE matrix.  A
   compressed-row index set is always constructed for the GPU, even when the
   host matrix does not use one.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
       It may be possible to reuse nonzero structure with new matrix values but
       for simplicity and insured correctness we delete and build a new matrix on
       the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      // count the rows that contain at least one nonzero
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m = a->compressedrow.nrows;
        ii = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        // Forcing compressed row on the GPU ... only relevant for CSR storage
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        // keep the row offset of every nonempty row, and remember its original row index
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      // theBodyStream is shared with the parallel SpMV code (see the comment at the top of the file)
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      //    }
      if (!a->compressedrow.use) {
        // free data
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
      //A->spptr = cusparseMat;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

// NOTE(review): disabled CUSP-based GPU->CPU copy-back routine, kept for reference.
// #undef __FUNCT__
// #define __FUNCT__ "MatCUSPARSECopyFromGPU"
// PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
// {
//   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
//   Mat_SeqAIJ         *a          = (Mat_SeqAIJ *) A->data;
//   PetscInt            m          = A->rmap->n;
//   PetscErrorCode      ierr;

//   PetscFunctionBegin;
//   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
//     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
//       try {
//         cuspstruct->mat = Agpu;
//         if (a->compressedrow.use) {
//           //PetscInt *ii, *ridx;
//           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
//         } else {
//           PetscInt i;

//           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
//           a->nz    = cuspstruct->mat->values.size();
//           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
//           A->preallocated = PETSC_TRUE;
//           // Copy ai, aj, aa
//           if (a->singlemalloc) {
//             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
//           } else {
//             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
//             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
//             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
//           }
//           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
//           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
//           a->singlemalloc = PETSC_TRUE;
//           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
//           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
//           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
//           // Setup row lengths
//           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
//           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
//           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
//           for(i = 0; i < m; ++i) {
//             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
//           }
//           // a->diag?
//         }
//         cuspstruct->tempvec = new CUSPARRAY;
//         cuspstruct->tempvec->resize(m);
//       } catch(char *ex) {
//         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
//       }
//     }
//     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
//     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
//     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
//     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
//   } else {
//     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
//   }
//   PetscFunctionReturn(0);
// }

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) vectors compatible with this matrix: the right
   vector matches the column layout, the left vector the row layout.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    // NOTE(review): uses the row block size for the right vector too -- confirm cmap->bs was not intended
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* GPU sparse matrix-vector product yy = A*xx. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  // The line below should not be necessary as it has been
moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  // presumably the multiply ran on a non-default stream when hasNonZeroStream() is true,
  // in which case the synchronization is skipped here
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* GPU product with the transpose (Hermitian transpose for complex scalars): yy = A^T*xx. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
PetscFunctionReturn(0); 706 } 707 708 #undef __FUNCT__ 709 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 710 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 711 { 712 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 713 PetscErrorCode ierr; 714 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 715 CUSPARRAY *xarray,*yarray,*zarray; 716 PetscFunctionBegin; 717 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 718 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 719 try { 720 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 721 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 722 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 723 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 724 725 // multiply add 726 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 727 728 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 729 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 730 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 731 732 } catch(char* ex) { 733 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 734 } 735 ierr = WaitForGPU();CHKERRCUSP(ierr); 736 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 737 PetscFunctionReturn(0); 738 } 739 740 #undef __FUNCT__ 741 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 742 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 743 { 744 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 745 PetscErrorCode ierr; 746 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 747 CUSPARRAY *xarray,*yarray,*zarray; 748 PetscFunctionBegin; 749 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 750 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 751 try { 752 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 753 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 754 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 755 ierr = 
VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 756 757 // multiply add with matrix transpose 758 #if !defined(PETSC_USE_COMPLEX) 759 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 760 #else 761 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 762 #endif 763 764 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 765 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 766 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 767 768 } catch(char* ex) { 769 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 770 } 771 ierr = WaitForGPU();CHKERRCUSP(ierr); 772 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 #undef __FUNCT__ 777 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 778 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 779 { 780 PetscErrorCode ierr; 781 PetscFunctionBegin; 782 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 783 ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 784 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 785 // this is not necessary since MatCUSPARSECopyToGPU has been called. 786 //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 787 // A->valid_GPU_matrix = PETSC_CUSP_CPU; 788 //} 789 A->ops->mult = MatMult_SeqAIJCUSPARSE; 790 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 791 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 792 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 793 PetscFunctionReturn(0); 794 } 795 796 /* --------------------------------------------------------------------------------*/ 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 799 /*@C 800 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 801 (the default parallel PETSc format). For good matrix assembly performance 802 the user should preallocate the matrix storage by setting the parameter nz 803 (or the array nnz). 
By setting these parameters accurately, performance
   during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()

@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the type is already set, so the type-specific SeqAIJ preallocation routine is invoked directly */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Releases the GPU-side data hanging off A->spptr and then
   delegates to MatDestroy_SeqAIJ() for the host-side storage.

   For an unfactored matrix (MAT_FACTOR_NONE), spptr holds a Mat_SeqAIJCUSPARSE container
   (GPU matrix + temp vector); for a factored matrix it holds the Mat_SeqAIJCUSPARSETriFactors
   container with the lower/upper triangular factors. Both were allocated with C++ new in
   MatCreate_SeqAIJCUSPARSE / the factorization routines, hence the explicit deletes here.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
        delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
        delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a file-global shared by every CUSPARSE matrix
     (created once in MatCreate_SeqAIJCUSPARSE); destroying it here when the *first* matrix
     is destroyed would invalidate the handle for any still-living matrices — confirm the
     intended lifetime, or consider reference counting. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree
    only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE.

   Builds on MatCreate_SeqAIJ(), then attaches the GPU container to B->spptr:
   a Mat_SeqAIJCUSPARSE for regular matrices or a Mat_SeqAIJCUSPARSETriFactors
   shell (filled in later by the symbolic/numeric factorization) for factor
   matrices. Also lazily creates the shared cuSPARSE library handle and installs
   the CUSPARSE implementations in the matrix operations table.
*/
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR;   /* default GPU storage; changeable via -mat_*_cusparse_storage_format */

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat     = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format  = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format         = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END