/*
   Defines the basic matrix operations for the AIJ (compressed row)
   matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
// NOTE(review): VecType is #undef'd before including cusparsematimpl.h,
// presumably to avoid a name clash with a C++ declaration in that header —
// confirm before reordering any of the includes above.
#undef VecType
#include "cusparsematimpl.h"

// this is such a hack ... but I don't know of another way to pass this variable
// from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
// SpMV. Essentially, I need to use the same stream variable in two different
// data structures. I do this by creating a single instance of that stream
// and reuse it. (Shared CUDA stream handle; 0 selects the default stream until
// buildStreamsAndEvents() fills it in below in MatCUSPARSECopyToGPU.)
cudaStream_t theBodyStream=0;

// Forward declarations of the SeqAIJCUSPARSE implementations defined later in
// this file; they are installed into the Mat ops tables below.
EXTERN_C_BEGIN
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the solver package name (MATSOLVERCUSPARSE) for a factored matrix;
   composed onto factor matrices as "MatFactorGetSolverPackage_C". */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
// Defined in the plain SeqAIJ implementation; used below as the base factory.
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END


/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
  on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp

  ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves

  Options Database Keys:
+ -mat_mult_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB

  Level: beginner
M*/

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory for CUSPARSE factor matrices: creates the factor matrix with the
   plain PETSc SeqAIJ factory, then retypes it to MATSEQAIJCUSPARSE and installs
   the CUSPARSE symbolic-factorization entry points. Only LU/ILU/ILUDT factor
   types are supported.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format (CSR/ELL/HYB) used for the triangular factors.
   If factors already exist on the GPU they are deleted and rebuilt in the new
   format via MatCUSPARSEAnalysisAndCopyToGPU().
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    // If data exists already delete
    if (cusparseMatLo)
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
    if (cusparseMatUp)
      delete (GPU_Matrix_Ifc *)cusparseMatUp;

    // key data was deleted so reset the flag.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild matrix
    ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   Handles the storage-format MatOptions for the SpMV matrix (CSR/ELL/HYB;
   DIA is rejected) after delegating all options to MatSetOption_SeqAIJ.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_CSR:
    cusparseMat->format = CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_ELL:
    cusparseMat->format = ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
  case MAT_OFFDIAGBLOCK_HYB:
  case MAT_HYB:
    cusparseMat->format = HYB;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes -mat_mult_cusparse_storage_format (for SpMV matrices) or
   -mat_solve_cusparse_storage_format (for factor matrices) and applies the
   chosen GPU storage format.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       idx=0;
  char * formats[] ={CSR,ELL,HYB};
  // Initialize to the CSR default: previously 'format' was left uninitialized
  // and only assigned when one of the pointer comparisons below matched, which
  // relies on string-literal pooling and could leave it indeterminate.
  MatOption format = MAT_CSR;
  PetscBool flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    // NOTE(review): these compare pointers, not strings; they work because
    // formats[] is initialized from the same CSR/ELL/HYB constants.
    if (formats[idx] == CSR)
      format=MAT_CSR;
    else if (formats[idx] == ELL)
      format=MAT_ELL;
    else if (formats[idx] == HYB)
      format=MAT_HYB;

    ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  }
  else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
//EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic ILU via SeqAIJ, then route numeric factorization to the
   CUSPARSE version so the factors get analyzed/copied to the GPU. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic LU via SeqAIJ, then route numeric factorization to the
   CUSPARSE version. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Extracts the unit lower-triangular factor L (with explicit 1's on the
   diagonal) from the host SeqAIJ factor data and uploads it to the GPU,
   running the cusparse triangular-solve analysis.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
217 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 218 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 219 cusparseStatus_t stat; 220 const PetscInt *ai = a->i,*aj = a->j,*vi; 221 const MatScalar *aa = a->a,*v; 222 PetscErrorCode ierr; 223 PetscInt *AiLo, *AjLo; 224 PetscScalar *AALo; 225 PetscInt i,nz, nzLower, offset, rowOffset; 226 227 PetscFunctionBegin; 228 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 229 try { 230 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 231 nzLower=n+ai[n]-ai[1]; 232 233 /* Allocate Space for the lower triangular matrix */ 234 ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 235 ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 236 ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 237 238 /* Fill the lower triangular matrix */ 239 AiLo[0]=(PetscInt) 0; 240 AiLo[n]=nzLower; 241 AjLo[0]=(PetscInt) 0; 242 AALo[0]=(MatScalar) 1.0; 243 v = aa; 244 vi = aj; 245 offset=1; 246 rowOffset=1; 247 for (i=1; i<n; i++) { 248 nz = ai[i+1] - ai[i]; 249 // additional 1 for the term on the diagonal 250 AiLo[i]=rowOffset; 251 rowOffset+=nz+1; 252 253 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 254 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 255 256 offset+=nz; 257 AjLo[offset]=(PetscInt) i; 258 AALo[offset]=(MatScalar) 1.0; 259 offset+=1; 260 261 v += nz; 262 vi += nz; 263 } 264 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 265 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 266 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 267 stat = 
cusparseMat->solveAnalysis();CHKERRCUSP(stat); 268 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 269 // free temporary data 270 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 271 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 272 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 273 } catch(char* ex) { 274 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 275 } 276 } 277 PetscFunctionReturn(0); 278 } 279 280 #undef __FUNCT__ 281 #define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix" 282 PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A) 283 { 284 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 285 PetscInt n = A->rmap->n; 286 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 287 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 288 cusparseStatus_t stat; 289 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 290 const MatScalar *aa = a->a,*v; 291 PetscInt *AiUp, *AjUp; 292 PetscScalar *AAUp; 293 PetscInt i,nz, nzUpper, offset; 294 PetscErrorCode ierr; 295 296 PetscFunctionBegin; 297 298 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 299 try { 300 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 301 nzUpper = adiag[0]-adiag[n]; 302 303 /* Allocate Space for the upper triangular matrix */ 304 ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 305 ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 306 ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 307 308 /* Fill the upper triangular matrix */ 309 AiUp[0]=(PetscInt) 0; 310 AiUp[n]=nzUpper; 311 offset = nzUpper; 312 for (i=n-1; i>=0; i--){ 313 v = aa + adiag[i+1] + 1; 314 vi = aj + adiag[i+1] + 1; 315 316 // number of elements NOT on the diagonal 317 nz = adiag[i] - adiag[i+1]-1; 318 319 // decrement the offset 320 offset -= (nz+1); 321 322 // first, set the diagonal elements 323 AjUp[offset] = (PetscInt) i; 324 AAUp[offset] = 1./v[nz]; 325 AiUp[i] = AiUp[i+1] - (nz+1); 326 327 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 328 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 329 } 330 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 331 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 332 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 333 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 334 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 335 // free temporary data 336 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 337 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 338 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 339 } catch(char* ex) { 340 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 341 } 342 } 343 PetscFunctionReturn(0); 344 } 345 346 #undef __FUNCT__ 347 #define __FUNCT__ "MatCUSPARSEAnalysiAndCopyToGPU" 348 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A) 349 { 350 PetscErrorCode ierr; 351 Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 352 
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 353 IS isrow = a->row,iscol = a->icol; 354 PetscBool row_identity,col_identity; 355 const PetscInt *r,*c; 356 PetscInt n = A->rmap->n; 357 358 PetscFunctionBegin; 359 ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 360 ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 361 cusparseTriFactors->tempvec = new CUSPARRAY; 362 cusparseTriFactors->tempvec->resize(n); 363 364 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 365 //lower triangular indices 366 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 367 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 368 if (!row_identity) 369 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 370 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 371 372 //upper triangular indices 373 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 374 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 375 if (!col_identity) 376 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 377 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 378 PetscFunctionReturn(0); 379 } 380 381 #undef __FUNCT__ 382 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 383 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 384 { 385 PetscErrorCode ierr; 386 Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 387 IS isrow = b->row,iscol = b->col; 388 PetscBool row_identity,col_identity; 389 390 PetscFunctionBegin; 391 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 392 // determine which version of MatSolve needs to be used. 
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  // Identity orderings allow the cheaper solve without GPU-side permutations.
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  // Not sure why this is necessary but somehow the function pointers got reset
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  //if (!row_identity) printf("Row permutations exist!");
  //if (!col_identity) printf("Col permutations exist!");

  // get the triangular factors
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Triangular solve xx = U^{-1} L^{-1} bb on the GPU with row/column
   reordering applied before and after the two sweeps (the general case when
   the factorization used non-identity orderings).
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve with reordering: permute b into x, sweep L then U through the
  // temporary vector, then undo the permutation on x.
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}




#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Triangular solve for the natural-ordering case: identical to
   MatSolve_SeqAIJCUSPARSE but with no permutation steps.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   Copies the SeqAIJ matrix to the GPU in the selected storage format,
   (re)building the GPU matrix from scratch. Rows with no nonzeros are
   compressed out (compressed-row form is forced even when the CPU side did
   not use it).
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      // count rows that have at least one nonzero
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        // Forcing compressed row on the GPU ... only relevant for CSR storage
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      // shares theBodyStream across matrix objects (see comment at its definition)
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      // }
      if (!a->compressedrow.use) {
        // free data (ii/ridx were allocated above only in this branch)
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
      //A->spptr = cusparseMat;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

// The following GPU->CPU copy routine is retained, commented out, from the
// CUSP implementation; it has not been ported to this CUSPARSE backend.
// #undef __FUNCT__
// #define __FUNCT__ "MatCUSPARSECopyFromGPU"
// PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
// {
//   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
//   Mat_SeqAIJ         *a          = (Mat_SeqAIJ *) A->data;
//   PetscInt            m          = A->rmap->n;
//   PetscErrorCode      ierr;

//   PetscFunctionBegin;
//   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
//     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
//       try {
//         cuspstruct->mat = Agpu;
//         if (a->compressedrow.use) {
//           //PetscInt *ii, *ridx;
//           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
//         } else {
//           PetscInt i;

//           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be
// %d", cuspstruct->mat->row_offsets.size()-1, m);}
//           a->nz = cuspstruct->mat->values.size();
//           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
//           A->preallocated = PETSC_TRUE;
//           // Copy ai, aj, aa
//           if (a->singlemalloc) {
//             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
//           } else {
//             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
//             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
//             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
//           }
//           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
//           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
//           a->singlemalloc = PETSC_TRUE;
//           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
//           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
//           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
//           // Setup row lengths
//           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
//           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
//           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
//           for(i = 0; i < m; ++i) {
//             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
//           }
//           // a->diag?
//         }
//         cuspstruct->tempvec = new CUSPARRAY;
//         cuspstruct->tempvec->resize(m);
//       } catch(char *ex) {
//         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
//       }
//     }
//     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
//     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
//     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
//     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
//   } else {
//     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
//   }
//   PetscFunctionReturn(0);
// }

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates compatible CUSP vectors for this matrix (right vector sized by the
   column map, left vector by the row map).
   NOTE(review): the right vector's block size is taken from mat->rmap->bs even
   though its length comes from cmap — confirm this is intended for
   rectangular matrices.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* Sparse matrix-vector product yy = A*xx on the GPU. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  // only synchronize when the multiply ran on the default stream
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* Transpose (Hermitian, for complex scalars) matrix-vector product yy = A^T*xx. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY *xarray,*yarray;

  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
PetscFunctionReturn(0); 712 } 713 714 #undef __FUNCT__ 715 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 716 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 717 { 718 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 719 PetscErrorCode ierr; 720 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 721 CUSPARRAY *xarray,*yarray,*zarray; 722 PetscFunctionBegin; 723 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 724 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 725 try { 726 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 727 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 728 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 729 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 730 731 // multiply add 732 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 733 734 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 735 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 736 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 737 738 } catch(char* ex) { 739 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 740 } 741 ierr = WaitForGPU();CHKERRCUSP(ierr); 742 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 743 PetscFunctionReturn(0); 744 } 745 746 #undef __FUNCT__ 747 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 748 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 749 { 750 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 751 PetscErrorCode ierr; 752 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 753 CUSPARRAY *xarray,*yarray,*zarray; 754 PetscFunctionBegin; 755 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 756 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 757 try { 758 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 759 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 760 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 761 ierr = 
VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 762 763 // multiply add with matrix transpose 764 #if !defined(PETSC_USE_COMPLEX) 765 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 766 #else 767 ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 768 #endif 769 770 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 771 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 772 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 773 774 } catch(char* ex) { 775 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 776 } 777 ierr = WaitForGPU();CHKERRCUSP(ierr); 778 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 779 PetscFunctionReturn(0); 780 } 781 782 #undef __FUNCT__ 783 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 784 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 785 { 786 PetscErrorCode ierr; 787 PetscFunctionBegin; 788 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 789 ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 790 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 791 // this is not necessary since MatCUSPARSECopyToGPU has been called. 792 //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 793 // A->valid_GPU_matrix = PETSC_CUSP_CPU; 794 //} 795 A->ops->mult = MatMult_SeqAIJCUSPARSE; 796 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 797 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 798 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 799 PetscFunctionReturn(0); 800 } 801 802 /* --------------------------------------------------------------------------------*/ 803 #undef __FUNCT__ 804 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 805 /*@C 806 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 807 (the default parallel PETSc format). For good matrix assembly performance 808 the user should preallocate the matrix storage by setting the parameter nz 809 (or the array nnz). 
By setting these parameters accurately, performance 810 during matrix assembly can be increased by more than a factor of 50. 811 812 Collective on MPI_Comm 813 814 Input Parameters: 815 + comm - MPI communicator, set to PETSC_COMM_SELF 816 . m - number of rows 817 . n - number of columns 818 . nz - number of nonzeros per row (same for all rows) 819 - nnz - array containing the number of nonzeros in the various rows 820 (possibly different for each row) or PETSC_NULL 821 822 Output Parameter: 823 . A - the matrix 824 825 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 826 MatXXXXSetPreallocation() paradgm instead of this routine directly. 827 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 828 829 Notes: 830 If nnz is given then nz is ignored 831 832 The AIJ format (also called the Yale sparse matrix format or 833 compressed row storage), is fully compatible with standard Fortran 77 834 storage. That is, the stored row and column indices can begin at 835 either one (as in Fortran) or zero. See the users' manual for details. 836 837 Specify the preallocated storage with either nz or nnz (not both). 838 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 839 allocation. For large problems you MUST preallocate memory or you 840 will get TERRIBLE performance, see the users' manual chapter on matrices. 841 842 By default, this format uses inodes (identical nodes) when possible, to 843 improve numerical efficiency of matrix-vector products and solves. We 844 search for consecutive rows with the same nonzero structure, thereby 845 reusing matrix information to achieve increased efficiency. 
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()

@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  // sequential matrix: local and global sizes coincide
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  // cast drops const: the internal preallocation routine takes a non-const array
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Releases the GPU-side data hanging off A->spptr
   (the Mat_SeqAIJCUSPARSE container for a regular matrix, or the
   Mat_SeqAIJCUSPARSETriFactors container for a factored one), tears down the
   shared cusparse handle, and then delegates to MatDestroy_SeqAIJ().
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      // only delete the GPU matrix if it was ever allocated
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
        delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
        delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      // (deleting a null pointer is a no-op, so unset factors are safe here)
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    // NOTE(review): MAT_cusparseHandle is shared by every aijcusparse matrix
    // (created lazily in MatCreate_SeqAIJCUSPARSE below); destroying it here,
    // when one matrix dies, invalidates it for any others still alive until the
    // next create -- confirm no live matrix still uses it at this point.
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree
     only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE: builds the
   underlying SeqAIJ matrix, attaches the GPU container to B->spptr (regular
   matrix or triangular-factor variant depending on B->factortype), lazily
   creates the shared cusparse handle, and installs the CUSPARSE operations.
*/
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR;  // default GPU storage format for both containers

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat     = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format  = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format         = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve.
// Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  // route the matrix operations through the CUSPARSE implementations
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  // mark the GPU copy as not yet allocated; it is built when data is copied over
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END