1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format. 4 */ 5 6 #include "petscconf.h" 7 PETSC_CUDA_EXTERN_C_BEGIN 8 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 9 //#include "petscbt.h" 10 #include "../src/vec/vec/impls/dvecimpl.h" 11 #include "petsc-private/vecimpl.h" 12 PETSC_CUDA_EXTERN_C_END 13 #undef VecType 14 #include "cusparsematimpl.h" 15 16 17 EXTERN_C_BEGIN 18 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 19 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 20 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *); 21 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 22 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 23 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 24 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat); 25 EXTERN_C_END 26 27 EXTERN_C_BEGIN 28 #undef __FUNCT__ 29 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 30 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 31 { 32 PetscFunctionBegin; 33 *type = MATSOLVERCUSPARSE; 34 PetscFunctionReturn(0); 35 } 36 EXTERN_C_END 37 38 EXTERN_C_BEGIN 39 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 40 EXTERN_C_END 41 42 43 44 /*MC 45 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 46 on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 47 48 ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves 49 50 Options Database Keys: 51 + -mat_mult_cusparse_storage_format <CSR> - (choose one of) CSR, ELL 52 + -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL 53 54 Level: beginner 55 M*/ 56 57 EXTERN_C_BEGIN 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 60 PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 61 { 62 PetscErrorCode ierr; 63 64 PetscFunctionBegin; 65 ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 66 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){ 67 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 68 ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 69 ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 70 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 71 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 72 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 73 (*B)->factortype = ftype; 74 PetscFunctionReturn(0); 75 } 76 EXTERN_C_END 77 78 #undef __FUNCT__ 79 #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatMult" 80 PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatMult(Mat A,const GPUStorageFormat format) 81 { 82 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 83 PetscFunctionBegin; 84 cusparseMat->format = format; 85 PetscFunctionReturn(0); 86 } 87 88 #undef __FUNCT__ 89 #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve" 90 PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,const GPUStorageFormat format) 91 { 92 PetscErrorCode ierr; 93 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 94 GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 95 GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 96 PetscFunctionBegin; 97 cusparseTriFactors->format = format; 98 if (cusparseMatLo && cusparseMatUp) { 99 // If data exists already delete 100 if (cusparseMatLo) 101 delete (GPU_Matrix_Ifc *)cusparseMatLo; 102 if (cusparseMatUp) 103 delete (GPU_Matrix_Ifc *)cusparseMatUp; 104 105 // key data was deleted so reset the flag. 106 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 107 108 // rebuild matrix 109 ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr); 110 } 111 PetscFunctionReturn(0); 112 } 113 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 117 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 118 { 119 PetscErrorCode ierr; 120 PetscInt idx=0; 121 const char * formats[]={CSR,ELL}; 122 PetscBool flg; 123 PetscFunctionBegin; 124 ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr); 125 if (A->factortype==MAT_FACTOR_NONE) { 126 ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format", 127 "Set the storage format of (seq)aijcusparse gpu matrices for SpMV", 128 "None",formats,2,formats[0],&idx,&flg);CHKERRQ(ierr); 129 ierr=MatAIJCUSPARSESetGPUStorageFormatForMatMult(A,formats[idx]);CHKERRQ(ierr); 130 } 131 else { 132 ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format", 133 "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve", 134 "None",formats,2,formats[0],&idx,&flg);CHKERRQ(ierr); 135 ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr); 136 } 137 ierr = PetscOptionsEnd();CHKERRQ(ierr); 138 PetscFunctionReturn(0); 139 140 } 141 142 #undef __FUNCT__ 143 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 144 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 145 { 146 PetscErrorCode ierr; 147 148 PetscFunctionBegin; 149 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 150 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 151 PetscFunctionReturn(0); 152 } 153 154 #undef __FUNCT__ 155 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 156 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 157 { 158 PetscErrorCode ierr; 159 160 PetscFunctionBegin; 161 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 162 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 163 PetscFunctionReturn(0); 164 } 165 166 #undef __FUNCT__ 167 #define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix" 168 PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A) 169 { 170 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 171 PetscInt n = A->rmap->n; 172 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 173 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 174 cusparseStatus_t stat; 175 const PetscInt *ai = a->i,*aj = a->j,*vi; 176 const MatScalar *aa = a->a,*v; 177 PetscErrorCode ierr; 178 PetscInt *AiLo, *AjLo; 179 PetscScalar *AALo; 180 PetscInt i,nz, nzLower, offset, rowOffset; 181 182 PetscFunctionBegin; 183 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 184 try { 185 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 186 nzLower=n+ai[n]-ai[1]; 187 188 /* Allocate Space for the lower triangular matrix */ 189 ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 190 ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 191 ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 192 193 /* Fill the lower triangular matrix */ 194 AiLo[0]=(PetscInt) 0; 195 AiLo[n]=nzLower; 196 AjLo[0]=(PetscInt) 0; 197 AALo[0]=(MatScalar) 1.0; 198 v = aa; 199 vi = aj; 200 offset=1; 201 rowOffset=1; 202 for (i=1; i<n; i++) { 203 nz = ai[i+1] - ai[i]; 204 // additional 1 for the term on the diagonal 205 AiLo[i]=rowOffset; 206 rowOffset+=nz+1; 207 208 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 209 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 210 211 offset+=nz; 212 AjLo[offset]=(PetscInt) i; 213 AALo[offset]=(MatScalar) 1.0; 214 offset+=1; 215 216 v += nz; 217 vi += nz; 218 } 219 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 220 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 221 ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 222 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 223 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 224 // free temporary data 225 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 226 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 227 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 228 } catch(char* ex) { 229 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 230 } 231 } 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix" 237 PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A) 238 { 239 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 240 PetscInt n = A->rmap->n; 241 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 242 GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 243 cusparseStatus_t stat; 244 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 245 const MatScalar *aa = a->a,*v; 246 PetscInt *AiUp, *AjUp; 247 PetscScalar *AAUp; 248 PetscInt i,nz, nzUpper, offset; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 253 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 254 try { 255 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 256 nzUpper = adiag[0]-adiag[n]; 257 258 /* Allocate Space for the upper triangular matrix */ 259 ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 260 ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 261 ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 262 263 /* Fill the upper triangular matrix */ 264 AiUp[0]=(PetscInt) 0; 265 AiUp[n]=nzUpper; 266 offset = nzUpper; 267 for (i=n-1; i>=0; i--){ 268 v = aa + adiag[i+1] + 1; 269 vi = aj + adiag[i+1] + 1; 270 271 // number of elements NOT on the diagonal 272 nz = adiag[i] - adiag[i+1]-1; 273 274 // decrement the offset 275 offset -= (nz+1); 276 277 // first, set the diagonal elements 278 AjUp[offset] = (PetscInt) i; 279 AAUp[offset] = 1./v[nz]; 280 AiUp[i] = AiUp[i+1] - (nz+1); 281 282 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 283 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 284 } 285 cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 286 stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 287 ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 288 stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 289 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 290 // free temporary data 291 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 292 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 293 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 294 } catch(char* ex) { 295 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 296 } 297 } 298 PetscFunctionReturn(0); 299 } 300 301 #undef __FUNCT__ 302 #define __FUNCT__ "MatCUSPARSEAnalysiAndCopyToGPU" 303 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A) 304 { 305 PetscErrorCode ierr; 306 Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 307 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 308 IS isrow = a->row,iscol = a->icol; 309 PetscBool row_identity,col_identity; 310 const PetscInt *r,*c; 311 PetscInt n = A->rmap->n; 312 313 PetscFunctionBegin; 314 ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 315 ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 316 cusparseTriFactors->tempvec = new CUSPARRAY; 317 cusparseTriFactors->tempvec->resize(n); 318 319 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 320 //lower triangular indices 321 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 322 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 323 if (!row_identity) 324 ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 325 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 326 327 //upper triangular indices 328 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 329 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 330 if (!col_identity) 331 ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 332 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 333 PetscFunctionReturn(0); 334 } 335 336 #undef __FUNCT__ 337 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 338 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 339 { 340 PetscErrorCode ierr; 341 Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 342 IS isrow = b->row,iscol = b->col; 343 PetscBool row_identity,col_identity; 344 345 PetscFunctionBegin; 346 std::cout << "1 MatLUFactorNumeric_SeqAIJCUSPARSE"<<std::endl; 347 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 348 std::cout << "2 MatLUFactorNumeric_SeqAIJCUSPARSE"<<std::endl; 349 // determine which version of MatSolve needs to be used. 350 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 351 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 352 if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 353 else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 354 if (!row_identity) printf("Row permutations exist!"); 355 if (!col_identity) printf("Col permutations exist!"); 356 357 // get the triangular factors 358 ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 359 std::cout << "3 MatLUFactorNumeric_SeqAIJCUSPARSE"<<std::endl; 360 PetscFunctionReturn(0); 361 } 362 363 364 365 #undef __FUNCT__ 366 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 367 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 368 { 369 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 370 PetscErrorCode ierr; 371 CUSPARRAY *xGPU, *bGPU; 372 cusparseStatus_t stat; 373 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 374 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 375 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 376 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 377 378 PetscFunctionBegin; 379 // Get the GPU pointers 380 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 381 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 382 383 // solve with reordering 384 ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 385 stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 386 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 387 ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 388 389 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 390 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 391 ierr = WaitForGPU();CHKERRCUSP(ierr); 392 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 393 PetscFunctionReturn(0); 394 } 395 396 397 398 399 #undef __FUNCT__ 400 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 401 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 402 { 403 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 404 PetscErrorCode ierr; 405 CUSPARRAY *xGPU, *bGPU; 406 cusparseStatus_t stat; 407 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 408 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 409 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 410 CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 411 412 PetscFunctionBegin; 413 // Get the GPU pointers 414 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 415 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 416 417 // solve 418 stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 419 stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 420 421 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 422 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 423 ierr = WaitForGPU();CHKERRCUSP(ierr); 424 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 425 PetscFunctionReturn(0); 426 } 427 428 #undef __FUNCT__ 429 #define __FUNCT__ "MatCUSPARSECopyToGPU" 430 PetscErrorCode MatCUSPARSECopyToGPU(Mat A) 431 { 432 433 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 434 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 435 PetscInt m = A->rmap->n,*ii,*ridx; 436 PetscErrorCode ierr; 437 438 439 PetscFunctionBegin; 440 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 441 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 442 /* 443 It may be possible to reuse nonzero structure with new matrix values but 444 for simplicity and insured correctness we delete and build a new matrix on 445 the GPU. Likely a very small performance hit. 446 */ 447 if (cusparseMat->mat){ 448 try { 449 delete cusparseMat->mat; 450 if (cusparseMat->tempvec) 451 delete cusparseMat->tempvec; 452 453 } catch(char* ex) { 454 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 455 } 456 } 457 try { 458 cusparseMat->nonzerorow=0; 459 for (int j = 0; j<m; j++) 460 cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 461 462 if (a->compressedrow.use) { 463 m = a->compressedrow.nrows; 464 ii = a->compressedrow.i; 465 ridx = a->compressedrow.rindex; 466 } else { 467 // Forcing compressed row on the GPU ... only relevant for CSR storage 468 int k=0; 469 ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 470 ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 471 ii[0]=0; 472 for (int j = 0; j<m; j++) { 473 if ((a->i[j+1]-a->i[j])>0) { 474 ii[k] = a->i[j]; 475 ridx[k]= j; 476 k++; 477 } 478 } 479 ii[cusparseMat->nonzerorow] = a->nz; 480 m = cusparseMat->nonzerorow; 481 } 482 483 // Build our matrix ... first determine the GPU storage type 484 cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format); 485 486 // FILL MODE UPPER is irrelevant 487 cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 488 489 // Create the streams and events (if desired). 490 PetscMPIInt size; 491 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 492 ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 493 494 // lastly, build the matrix 495 ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 496 cusparseMat->mat->setCPRowIndices(ridx, m); 497 // } 498 if (!a->compressedrow.use) { 499 // free data 500 ierr = PetscFree(ii);CHKERRQ(ierr); 501 ierr = PetscFree(ridx);CHKERRQ(ierr); 502 } 503 504 cusparseMat->tempvec = new CUSPARRAY; 505 cusparseMat->tempvec->resize(m); 506 } catch(char* ex) { 507 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 508 } 509 ierr = WaitForGPU();CHKERRCUSP(ierr); 510 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 511 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 512 } 513 PetscFunctionReturn(0); 514 } 515 516 // #undef __FUNCT__ 517 // #define __FUNCT__ "MatCUSPARSECopyFromGPU" 518 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu) 519 // { 520 // Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr; 521 // Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data; 522 // PetscInt m = A->rmap->n; 523 // PetscErrorCode ierr; 524 525 // PetscFunctionBegin; 526 // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 527 // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 528 // try { 529 // cuspstruct->mat = Agpu; 530 // if (a->compressedrow.use) { 531 // //PetscInt *ii, *ridx; 532 // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices"); 533 // } else { 534 // PetscInt i; 535 536 // if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);} 537 // a->nz = cuspstruct->mat->values.size(); 538 // a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 539 // A->preallocated = PETSC_TRUE; 540 // // Copy ai, aj, aa 541 // if (a->singlemalloc) { 542 // if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 543 // } else { 544 // if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 545 // if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 546 // if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 547 // } 548 // ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 549 // ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 550 // a->singlemalloc = PETSC_TRUE; 551 // thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i); 552 // thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j); 553 // thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a); 554 // // Setup row lengths 555 // if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 556 // ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 557 // ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 558 // for(i = 0; i < m; ++i) { 559 // a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; 560 // } 561 // // a->diag? 562 // } 563 // cuspstruct->tempvec = new CUSPARRAY; 564 // cuspstruct->tempvec->resize(m); 565 // } catch(char *ex) { 566 // SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex); 567 // } 568 // } 569 // // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying 570 // ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 571 // ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 572 // A->valid_GPU_matrix = PETSC_CUSP_BOTH; 573 // } else { 574 // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices"); 575 // } 576 // PetscFunctionReturn(0); 577 // } 578 579 #undef __FUNCT__ 580 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 581 PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 582 { 583 PetscErrorCode ierr; 584 585 PetscFunctionBegin; 586 587 if (right) { 588 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 589 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 590 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 591 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 592 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 593 } 594 if (left) { 595 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 596 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 597 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 598 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 599 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 600 } 601 PetscFunctionReturn(0); 602 } 603 604 #undef __FUNCT__ 605 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 606 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 607 { 608 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 609 PetscErrorCode ierr; 610 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 611 CUSPARRAY *xarray,*yarray; 612 613 PetscFunctionBegin; 614 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 615 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 616 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 617 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 618 try { 619 ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 620 } catch (char* ex) { 621 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 622 } 623 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 624 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 625 if (!cusparseMat->mat->hasNonZeroStream()) 626 ierr = WaitForGPU();CHKERRCUSP(ierr); 627 ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 628 PetscFunctionReturn(0); 629 } 630 631 632 #undef __FUNCT__ 633 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 634 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 635 { 636 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 637 PetscErrorCode ierr; 638 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 639 CUSPARRAY *xarray,*yarray,*zarray; 640 PetscFunctionBegin; 641 // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 642 // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 643 try { 644 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 645 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 646 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 647 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 648 649 // multiply add 650 ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 651 652 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 653 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 654 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 655 656 } catch(char* ex) { 657 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 658 } 659 ierr = WaitForGPU();CHKERRCUSP(ierr); 660 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 661 PetscFunctionReturn(0); 662 } 663 664 #undef __FUNCT__ 665 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 666 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 667 { 668 PetscErrorCode ierr; 669 PetscFunctionBegin; 670 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 671 ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 672 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 673 // this is not necessary since MatCUSPARSECopyToGPU has been called. 674 //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 675 // A->valid_GPU_matrix = PETSC_CUSP_CPU; 676 //} 677 A->ops->mult = MatMult_SeqAIJCUSPARSE; 678 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 679 PetscFunctionReturn(0); 680 } 681 682 /* --------------------------------------------------------------------------------*/ 683 #undef __FUNCT__ 684 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 685 /*@C 686 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 687 (the default parallel PETSc format). For good matrix assembly performance 688 the user should preallocate the matrix storage by setting the parameter nz 689 (or the array nnz). By setting these parameters accurately, performance 690 during matrix assembly can be increased by more than a factor of 50. 691 692 Collective on MPI_Comm 693 694 Input Parameters: 695 + comm - MPI communicator, set to PETSC_COMM_SELF 696 . m - number of rows 697 . n - number of columns 698 . nz - number of nonzeros per row (same for all rows) 699 - nnz - array containing the number of nonzeros in the various rows 700 (possibly different for each row) or PETSC_NULL 701 702 Output Parameter: 703 . A - the matrix 704 705 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 706 MatXXXXSetPreallocation() paradgm instead of this routine directly. 707 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 708 709 Notes: 710 If nnz is given then nz is ignored 711 712 The AIJ format (also called the Yale sparse matrix format or 713 compressed row storage), is fully compatible with standard Fortran 77 714 storage. That is, the stored row and column indices can begin at 715 either one (as in Fortran) or zero. See the users' manual for details. 716 717 Specify the preallocated storage with either nz or nnz (not both). 718 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 719 allocation. For large problems you MUST preallocate memory or you 720 will get TERRIBLE performance, see the users' manual chapter on matrices. 721 722 By default, this format uses inodes (identical nodes) when possible, to 723 improve numerical efficiency of matrix-vector products and solves. We 724 search for consecutive rows with the same nonzero structure, thereby 725 reusing matrix information to achieve increased efficiency. 726 727 Level: intermediate 728 729 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 730 731 @*/ 732 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 733 { 734 PetscErrorCode ierr; 735 736 PetscFunctionBegin; 737 ierr = MatCreate(comm,A);CHKERRQ(ierr); 738 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 739 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 740 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 741 PetscFunctionReturn(0); 742 } 743 744 #undef __FUNCT__ 745 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 746 PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 747 { 748 PetscErrorCode ierr; 749 Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 750 751 PetscFunctionBegin; 752 if (A->factortype==MAT_FACTOR_NONE) { 753 // The regular matrices 754 try { 755 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 756 delete (GPU_Matrix_Ifc *)(cusparseMat->mat); 757 } 758 if (cusparseMat->tempvec!=0) 759 delete cusparseMat->tempvec; 760 delete cusparseMat; 761 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 762 } catch(char* ex) { 763 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 764 } 765 } else { 766 // The triangular factors 767 try { 768 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 769 GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 770 GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 771 // destroy the matrix and the container 772 delete (GPU_Matrix_Ifc *)cusparseMatLo; 773 delete (GPU_Matrix_Ifc *)cusparseMatUp; 774 delete (CUSPARRAY*) cusparseTriFactors->tempvec; 775 delete cusparseTriFactors; 776 } catch(char* ex) { 777 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 778 } 779 } 780 if (MAT_cusparseHandle) { 781 cusparseStatus_t stat; 782 stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 783 MAT_cusparseHandle=0; 784 } 785 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 786 A->spptr = 0; 787 788 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 789 PetscFunctionReturn(0); 790 } 791 792 793 EXTERN_C_BEGIN 794 #undef __FUNCT__ 795 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 796 PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 797 { 798 PetscErrorCode ierr; 799 800 PetscFunctionBegin; 801 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 802 B->ops->mult = MatMult_SeqAIJCUSPARSE; 803 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 804 805 if (B->factortype==MAT_FACTOR_NONE) { 806 /* you cannot check the inode.use flag here since the matrix was just created.*/ 807 B->spptr = new Mat_SeqAIJCUSPARSE; 808 ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0; 809 ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0; 810 ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = CSR; 811 } else { 812 /* NEXT, set the pointers to the triangular factors */ 813 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 814 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0; 815 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0; 816 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0; 817 ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = CSR; 818 } 819 // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) 820 if (!MAT_cusparseHandle) { 821 cusparseStatus_t stat; 822 stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 823 } 824 // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 825 // default cusparse tri solve. Note the difference with the implementation in 826 // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu 827 ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 828 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 829 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 830 B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 831 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 832 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 833 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 834 PetscFunctionReturn(0); 835 } 836 EXTERN_C_END 837 838