/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* forward declarations of the factorization, solve, and mult kernels defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* destruction helpers (C++ overloads; this file is compiled as C++) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a user-provided CUDA stream to the matrix' cuSPARSE handle so that
   subsequent cuSPARSE calls on this matrix are issued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing spptr: matrix must be of (seq)aijcusparse type");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix' cuSPARSE handle with a user-provided one; the old handle
   (if any, and if different) is destroyed.  The pointer mode is reset to
   CUSPARSE_POINTER_MODE_DEVICE on the handle that ends up attached. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing spptr: matrix must be of (seq)aijcusparse type");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the matrix' cuSPARSE handle; ownership is assumed
   to lie with whoever supplied it via MatCUSPARSESetHandle().  A no-op when the
   matrix has no CUSPARSE data structure attached. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory for CUSPARSE factor matrices: creates an (empty) square SEQAIJCUSPARSE
   matrix B and installs the symbolic-factorization callbacks appropriate for
   the requested factor type (LU/ILU/ILUDT or Cholesky/ICC). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): a SEQAIJCUSPARSE
   matrix has a single storage format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   both set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
   The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed on A (no-op for other types) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_mult_storage_format (non-factored matrices only) and
   -mat_cusparse_storage_format command-line options and applies them via
   MatCUSPARSESetFormat(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* ILU(k) symbolic phase: delegates to the CPU SeqAIJ implementation, then installs
   the CUSPARSE numeric-factorization callback on B. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* LU symbolic phase: delegates to the CPU SeqAIJ implementation, then installs
   the CUSPARSE numeric-factorization callback on B. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* ICC(k) symbolic phase: delegates to the CPU SeqAIJ implementation, then installs
   the CUSPARSE numeric-factorization callback on B. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Cholesky symbolic phase: delegates to the CPU SeqAIJ implementation, then installs
   the CUSPARSE numeric-factorization callback on B. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extracts the unit-diagonal lower-triangular factor L, in CSR form, from the
   combined LU storage of the factored SeqAIJ matrix A, copies it to the GPU
   (thrust arrays), and runs the cuSPARSE triangular-solve analysis on it.
   The CSR is assembled in pinned host memory (cudaMallocHost) row by row, with
   an explicit 1.0 appended on each row's diagonal.  Only runs when the data is
   not already on the GPU (offloadmask UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      /* NOTE(review): logs sizeof(int) although host arrays are PetscInt; the device
         index arrays are 32-bit (THRUSTINTARRAY32), so this approximates the transfer size */
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extracts the upper-triangular factor U (non-unit diagonal, stored inverted),
   in CSR form, from the combined LU storage of the factored SeqAIJ matrix A,
   copies it to the GPU, and runs the cuSPARSE triangular-solve analysis.
   Rows are filled backwards using a->diag, which for factored SeqAIJ points at
   the diagonal entries; the diagonal value is stored as its reciprocal. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU, allocates the solve work
   vector, and uploads the row/column permutation indices (only when the
   corresponding IS is not the identity). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutations were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds both ICC/Cholesky triangular factors on the GPU from the factored
   matrix's upper-triangular storage.  The upper factor U is stored with a unit
   diagonal and reciprocal diagonal entries; the "lower" factor reuses U's
   sparsity pattern with rescaled values and is solved with
   CUSPARSE_OPERATION_TRANSPOSE (so the UPPER fill mode is intentional there).
   NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ and the
   i/j/a arrays are read through the SBAIJ view — relies on the layout of the
   factored ICC storage; confirm against MatICCFactorSymbolic_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated for U; the L copy is additionally scaled by 1/diag */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      /* UPPER fill mode + TRANSPOSE solve op below together realize the lower solve */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the ICC triangular factors on the GPU, allocates the solve work
   vector, and — when the permutation is not the identity — uploads both the
   permutation and its inverse for use in the permuted solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* symmetric storage holds only the upper triangle: total nnz = 2*(offdiag)+diag */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* LU numeric phase: runs the CPU numeric factorization, selects the plain or
   natural-ordering GPU solve variants depending on whether both permutations
   are the identity, then builds the GPU triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Cholesky numeric phase: runs the CPU numeric factorization, then selects the
   GPU solve variants based on the permutation and builds the GPU factors. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 712 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 713 if (perm_identity) { 714 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 715 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 716 B->ops->matsolve = NULL; 717 B->ops->matsolvetranspose = NULL; 718 } else { 719 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 720 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 721 B->ops->matsolve = NULL; 722 B->ops->matsolvetranspose = NULL; 723 } 724 725 /* get the triangular factors */ 726 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 727 PetscFunctionReturn(0); 728 } 729 730 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 731 { 732 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 733 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 734 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 735 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 736 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 737 cusparseStatus_t stat; 738 cusparseIndexBase_t indexBase; 739 cusparseMatrixType_t matrixType; 740 cusparseFillMode_t fillMode; 741 cusparseDiagType_t diagType; 742 743 PetscFunctionBegin; 744 745 /*********************************************/ 746 /* Now the Transpose of the Lower Tri Factor */ 747 /*********************************************/ 748 749 /* allocate space for the transpose of the lower triangular factor */ 750 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 751 752 /* set the matrix descriptors of the lower triangular factor */ 753 matrixType = cusparseGetMatType(loTriFactor->descr); 754 indexBase = 
cusparseGetMatIndexBase(loTriFactor->descr); 755 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 756 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 757 diagType = cusparseGetMatDiagType(loTriFactor->descr); 758 759 /* Create the matrix description */ 760 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 761 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 762 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 763 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 764 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 765 766 /* Create the solve analysis information */ 767 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 768 769 /* set the operation */ 770 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 771 772 /* allocate GPU space for the CSC of the lower triangular factor*/ 773 loTriFactorT->csrMat = new CsrMatrix; 774 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 775 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 776 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 777 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 778 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 779 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 780 781 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 782 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 783 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 784 loTriFactor->csrMat->values->data().get(), 785 loTriFactor->csrMat->row_offsets->data().get(), 786 loTriFactor->csrMat->column_indices->data().get(), 787 loTriFactorT->csrMat->values->data().get(), 788 loTriFactorT->csrMat->column_indices->data().get(), 789 loTriFactorT->csrMat->row_offsets->data().get(), 790 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 791 792 /* perform the solve analysis on the transposed matrix */ 793 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 794 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 795 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 796 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 797 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 798 799 /* assign the pointer. Is this really necessary? */ 800 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 801 802 /*********************************************/ 803 /* Now the Transpose of the Upper Tri Factor */ 804 /*********************************************/ 805 806 /* allocate space for the transpose of the upper triangular factor */ 807 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 808 809 /* set the matrix descriptors of the upper triangular factor */ 810 matrixType = cusparseGetMatType(upTriFactor->descr); 811 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 812 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
813 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 814 diagType = cusparseGetMatDiagType(upTriFactor->descr); 815 816 /* Create the matrix description */ 817 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 818 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 819 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 820 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 821 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 822 823 /* Create the solve analysis information */ 824 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 825 826 /* set the operation */ 827 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 828 829 /* allocate GPU space for the CSC of the upper triangular factor*/ 830 upTriFactorT->csrMat = new CsrMatrix; 831 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 832 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 833 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 834 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 835 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 836 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 837 838 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 839 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 840 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 841 upTriFactor->csrMat->values->data().get(), 842 upTriFactor->csrMat->row_offsets->data().get(), 843 upTriFactor->csrMat->column_indices->data().get(), 844 upTriFactorT->csrMat->values->data().get(), 845 upTriFactorT->csrMat->column_indices->data().get(), 846 upTriFactorT->csrMat->row_offsets->data().get(), 847 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 848 849 /* perform the solve analysis on the transposed matrix */ 850 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 851 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 852 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 853 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 854 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 855 856 /* assign the pointer. Is this really necessary? 
*/ 857 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 858 PetscFunctionReturn(0); 859 } 860 861 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 862 { 863 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 864 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 865 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 866 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 867 cusparseStatus_t stat; 868 cusparseIndexBase_t indexBase; 869 cudaError_t err; 870 PetscErrorCode ierr; 871 872 PetscFunctionBegin; 873 874 /* allocate space for the triangular factor information */ 875 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 876 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 877 indexBase = cusparseGetMatIndexBase(matstruct->descr); 878 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 879 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 880 881 /* set alpha and beta */ 882 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 883 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 884 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 885 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 886 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 887 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 888 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 889 890 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 891 CsrMatrix *matrix = 
(CsrMatrix*)matstruct->mat; 892 CsrMatrix *matrixT= new CsrMatrix; 893 matrixT->num_rows = A->cmap->n; 894 matrixT->num_cols = A->rmap->n; 895 matrixT->num_entries = a->nz; 896 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 897 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 898 matrixT->values = new THRUSTARRAY(a->nz); 899 900 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 901 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 902 /* compute the transpose, i.e. the CSC */ 903 indexBase = cusparseGetMatIndexBase(matstruct->descr); 904 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 905 A->cmap->n, matrix->num_entries, 906 matrix->values->data().get(), 907 cusparsestruct->rowoffsets_gpu->data().get(), 908 matrix->column_indices->data().get(), 909 matrixT->values->data().get(), 910 matrixT->column_indices->data().get(), 911 matrixT->row_offsets->data().get(), 912 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 913 /* assign the pointer */ 914 matstructT->mat = matrixT; 915 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 916 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 917 /* First convert HYB to CSR */ 918 CsrMatrix *temp= new CsrMatrix; 919 temp->num_rows = A->rmap->n; 920 temp->num_cols = A->cmap->n; 921 temp->num_entries = a->nz; 922 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 923 temp->column_indices = new THRUSTINTARRAY32(a->nz); 924 temp->values = new THRUSTARRAY(a->nz); 925 926 927 stat = cusparse_hyb2csr(cusparsestruct->handle, 928 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 929 temp->values->data().get(), 930 temp->row_offsets->data().get(), 931 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 932 933 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 934 CsrMatrix *tempT= new CsrMatrix; 935 tempT->num_rows = A->rmap->n; 936 tempT->num_cols = A->cmap->n; 937 tempT->num_entries = a->nz; 938 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 939 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 940 tempT->values = new THRUSTARRAY(a->nz); 941 942 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 943 temp->num_cols, temp->num_entries, 944 temp->values->data().get(), 945 temp->row_offsets->data().get(), 946 temp->column_indices->data().get(), 947 tempT->values->data().get(), 948 tempT->column_indices->data().get(), 949 tempT->row_offsets->data().get(), 950 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 951 952 /* Last, convert CSC to HYB */ 953 cusparseHybMat_t hybMat; 954 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 955 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 956 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 957 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 958 matstructT->descr, tempT->values->data().get(), 959 tempT->row_offsets->data().get(), 960 tempT->column_indices->data().get(), 961 hybMat, 0, partition);CHKERRCUSPARSE(stat); 962 963 /* assign the pointer */ 964 matstructT->mat = hybMat; 965 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 966 967 /* delete temporaries */ 968 if (tempT) { 969 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 970 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 971 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 972 delete (CsrMatrix*) tempT; 973 } 974 if (temp) { 975 if (temp->values) delete (THRUSTARRAY*) temp->values; 976 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 977 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 978 delete 
(CsrMatrix*) temp; 979 } 980 } 981 /* the compressed row indices is not used for matTranspose */ 982 matstructT->cprowIndices = NULL; 983 /* assign the pointer */ 984 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 985 PetscFunctionReturn(0); 986 } 987 988 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 989 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 990 { 991 PetscInt n = xx->map->n; 992 const PetscScalar *barray; 993 PetscScalar *xarray; 994 thrust::device_ptr<const PetscScalar> bGPU; 995 thrust::device_ptr<PetscScalar> xGPU; 996 cusparseStatus_t stat; 997 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 998 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 999 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1000 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1001 PetscErrorCode ierr; 1002 cudaError_t cerr; 1003 1004 PetscFunctionBegin; 1005 /* Analyze the matrix and create the transpose ... 
on the fly */ 1006 if (!loTriFactorT && !upTriFactorT) { 1007 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1008 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1009 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1010 } 1011 1012 /* Get the GPU pointers */ 1013 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1014 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1015 xGPU = thrust::device_pointer_cast(xarray); 1016 bGPU = thrust::device_pointer_cast(barray); 1017 1018 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1019 /* First, reorder with the row permutation */ 1020 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1021 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1022 xGPU); 1023 1024 /* First, solve U */ 1025 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1026 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1027 upTriFactorT->csrMat->values->data().get(), 1028 upTriFactorT->csrMat->row_offsets->data().get(), 1029 upTriFactorT->csrMat->column_indices->data().get(), 1030 upTriFactorT->solveInfo, 1031 xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1032 1033 /* Then, solve L */ 1034 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1035 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1036 loTriFactorT->csrMat->values->data().get(), 1037 loTriFactorT->csrMat->row_offsets->data().get(), 1038 loTriFactorT->csrMat->column_indices->data().get(), 1039 loTriFactorT->solveInfo, 1040 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1041 1042 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1043 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1044 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1045 tempGPU->begin()); 1046 1047 /* Copy the temporary to the full solution. */ 1048 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1049 1050 /* restore */ 1051 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1052 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1053 cerr = WaitForGPU();CHKERRCUDA(cerr); 1054 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1055 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1056 PetscFunctionReturn(0); 1057 } 1058 1059 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1060 { 1061 const PetscScalar *barray; 1062 PetscScalar *xarray; 1063 cusparseStatus_t stat; 1064 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1065 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1066 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1067 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1068 PetscErrorCode ierr; 1069 cudaError_t cerr; 1070 1071 PetscFunctionBegin; 1072 /* Analyze the matrix and create the transpose ... 
on the fly */ 1073 if (!loTriFactorT && !upTriFactorT) { 1074 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1075 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1076 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1077 } 1078 1079 /* Get the GPU pointers */ 1080 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1081 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1082 1083 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1084 /* First, solve U */ 1085 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1086 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1087 upTriFactorT->csrMat->values->data().get(), 1088 upTriFactorT->csrMat->row_offsets->data().get(), 1089 upTriFactorT->csrMat->column_indices->data().get(), 1090 upTriFactorT->solveInfo, 1091 barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1092 1093 /* Then, solve L */ 1094 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1095 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1096 loTriFactorT->csrMat->values->data().get(), 1097 loTriFactorT->csrMat->row_offsets->data().get(), 1098 loTriFactorT->csrMat->column_indices->data().get(), 1099 loTriFactorT->solveInfo, 1100 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1101 1102 /* restore */ 1103 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1104 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1105 cerr = WaitForGPU();CHKERRCUDA(cerr); 1106 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1107 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1108 PetscFunctionReturn(0); 1109 } 1110 1111 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1112 { 1113 const PetscScalar *barray; 1114 PetscScalar *xarray; 1115 thrust::device_ptr<const PetscScalar> bGPU; 1116 
thrust::device_ptr<PetscScalar> xGPU; 1117 cusparseStatus_t stat; 1118 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1119 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1120 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1121 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1122 PetscErrorCode ierr; 1123 cudaError_t cerr; 1124 1125 PetscFunctionBegin; 1126 1127 /* Get the GPU pointers */ 1128 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1129 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1130 xGPU = thrust::device_pointer_cast(xarray); 1131 bGPU = thrust::device_pointer_cast(barray); 1132 1133 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1134 /* First, reorder with the row permutation */ 1135 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1136 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1137 tempGPU->begin()); 1138 1139 /* Next, solve L */ 1140 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1141 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1142 loTriFactor->csrMat->values->data().get(), 1143 loTriFactor->csrMat->row_offsets->data().get(), 1144 loTriFactor->csrMat->column_indices->data().get(), 1145 loTriFactor->solveInfo, 1146 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1147 1148 /* Then, solve U */ 1149 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1150 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1151 upTriFactor->csrMat->values->data().get(), 1152 upTriFactor->csrMat->row_offsets->data().get(), 1153 upTriFactor->csrMat->column_indices->data().get(), 1154 upTriFactor->solveInfo, 1155 xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 
1156 1157 /* Last, reorder with the column permutation */ 1158 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1159 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1160 xGPU); 1161 1162 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1163 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1164 cerr = WaitForGPU();CHKERRCUDA(cerr); 1165 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1166 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1167 PetscFunctionReturn(0); 1168 } 1169 1170 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1171 { 1172 const PetscScalar *barray; 1173 PetscScalar *xarray; 1174 cusparseStatus_t stat; 1175 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1176 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1177 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1178 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1179 PetscErrorCode ierr; 1180 cudaError_t cerr; 1181 1182 PetscFunctionBegin; 1183 /* Get the GPU pointers */ 1184 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1185 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1186 1187 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1188 /* First, solve L */ 1189 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1190 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1191 loTriFactor->csrMat->values->data().get(), 1192 loTriFactor->csrMat->row_offsets->data().get(), 1193 loTriFactor->csrMat->column_indices->data().get(), 1194 loTriFactor->solveInfo, 1195 barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1196 1197 /* Next, solve U */ 1198 stat = 
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1199 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1200 upTriFactor->csrMat->values->data().get(), 1201 upTriFactor->csrMat->row_offsets->data().get(), 1202 upTriFactor->csrMat->column_indices->data().get(), 1203 upTriFactor->solveInfo, 1204 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1205 1206 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1207 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1208 cerr = WaitForGPU();CHKERRCUDA(cerr); 1209 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1210 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1211 PetscFunctionReturn(0); 1212 } 1213 1214 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1215 { 1216 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1217 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1218 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1219 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1220 PetscErrorCode ierr; 1221 cusparseStatus_t stat; 1222 cudaError_t err; 1223 1224 PetscFunctionBegin; 1225 if (A->boundtocpu) PetscFunctionReturn(0); 1226 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1227 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1228 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1229 /* Copy values only */ 1230 CsrMatrix *mat,*matT; 1231 mat = (CsrMatrix*)cusparsestruct->mat->mat; 1232 mat->values->assign(a->a, a->a+a->nz); 1233 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1234 1235 /* Update matT when it was built before */ 1236 if (cusparsestruct->matTranspose) { 1237 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1238 matT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1239 stat = 
cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1240 A->cmap->n, mat->num_entries, 1241 mat->values->data().get(), 1242 cusparsestruct->rowoffsets_gpu->data().get(), 1243 mat->column_indices->data().get(), 1244 matT->values->data().get(), 1245 matT->column_indices->data().get(), 1246 matT->row_offsets->data().get(), 1247 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat); 1248 } 1249 } else { 1250 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1251 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1252 delete cusparsestruct->workVector; 1253 delete cusparsestruct->rowoffsets_gpu; 1254 try { 1255 if (a->compressedrow.use) { 1256 m = a->compressedrow.nrows; 1257 ii = a->compressedrow.i; 1258 ridx = a->compressedrow.rindex; 1259 } else { 1260 m = A->rmap->n; 1261 ii = a->i; 1262 ridx = NULL; /* In this case, one should not dereference ridx */ 1263 } 1264 cusparsestruct->nrows = m; 1265 1266 /* allocate space for the triangular factor information */ 1267 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1268 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1269 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1270 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1271 1272 err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1273 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1274 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1275 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1276 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1277 err = cudaMemcpy(matstruct->beta_one, 
&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1278 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1279 1280 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1281 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1282 /* set the matrix */ 1283 CsrMatrix *matrix= new CsrMatrix; 1284 matrix->num_rows = m; 1285 matrix->num_cols = A->cmap->n; 1286 matrix->num_entries = a->nz; 1287 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1288 matrix->row_offsets->assign(ii, ii + m+1); 1289 1290 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1291 matrix->column_indices->assign(a->j, a->j+a->nz); 1292 1293 matrix->values = new THRUSTARRAY(a->nz); 1294 matrix->values->assign(a->a, a->a+a->nz); 1295 1296 /* assign the pointer */ 1297 matstruct->mat = matrix; 1298 1299 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1300 CsrMatrix *matrix= new CsrMatrix; 1301 matrix->num_rows = m; 1302 matrix->num_cols = A->cmap->n; 1303 matrix->num_entries = a->nz; 1304 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1305 matrix->row_offsets->assign(ii, ii + m+1); 1306 1307 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1308 matrix->column_indices->assign(a->j, a->j+a->nz); 1309 1310 matrix->values = new THRUSTARRAY(a->nz); 1311 matrix->values->assign(a->a, a->a+a->nz); 1312 1313 cusparseHybMat_t hybMat; 1314 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1315 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1316 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1317 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1318 matstruct->descr, matrix->values->data().get(), 1319 matrix->row_offsets->data().get(), 1320 matrix->column_indices->data().get(), 1321 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1322 /* assign the pointer */ 1323 matstruct->mat = hybMat; 1324 1325 if (matrix) { 1326 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1327 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1328 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1329 delete (CsrMatrix*)matrix; 1330 } 1331 } 1332 1333 /* assign the compressed row indices */ 1334 if (a->compressedrow.use) { 1335 cusparsestruct->workVector = new THRUSTARRAY(m); 1336 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1337 matstruct->cprowIndices->assign(ridx,ridx+m); 1338 tmp = m; 1339 } else { 1340 cusparsestruct->workVector = NULL; 1341 matstruct->cprowIndices = NULL; 1342 tmp = 0; 1343 } 1344 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1345 1346 /* assign the pointer */ 1347 cusparsestruct->mat = matstruct; 1348 } catch(char *ex) { 1349 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1350 } 1351 cusparsestruct->nonzerostate = A->nonzerostate; 1352 } 1353 err = WaitForGPU();CHKERRCUDA(err); 1354 A->offloadmask = PETSC_OFFLOAD_BOTH; 1355 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1356 } 1357 PetscFunctionReturn(0); 1358 } 1359 1360 struct VecCUDAPlusEquals 1361 { 1362 template <typename Tuple> 1363 __host__ __device__ 1364 void operator()(Tuple t) 1365 { 1366 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1367 } 1368 }; 1369 1370 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1371 { 1372 PetscErrorCode ierr; 1373 1374 PetscFunctionBegin; 1375 ierr = 
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 1376 PetscFunctionReturn(0); 1377 } 1378 1379 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1380 { 1381 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1382 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1383 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1384 const PetscScalar *xarray; 1385 PetscScalar *yarray; 1386 PetscErrorCode ierr; 1387 cudaError_t cerr; 1388 cusparseStatus_t stat; 1389 1390 PetscFunctionBegin; 1391 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1392 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1393 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1394 if (!matstructT) { 1395 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1396 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1397 } 1398 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1399 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1400 if (yy->map->n) { 1401 PetscInt n = yy->map->n; 1402 cudaError_t err; 1403 err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */ 1404 } 1405 1406 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1407 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1408 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1409 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1410 mat->num_rows, mat->num_cols, 1411 mat->num_entries, matstructT->alpha, matstructT->descr, 1412 mat->values->data().get(), mat->row_offsets->data().get(), 1413 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1414 yarray);CHKERRCUSPARSE(stat); 1415 } else { 1416 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1417 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1418 matstructT->alpha, 
matstructT->descr, hybMat, 1419 xarray, matstructT->beta_zero, 1420 yarray);CHKERRCUSPARSE(stat); 1421 } 1422 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1423 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1424 cerr = WaitForGPU();CHKERRCUDA(cerr); 1425 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1426 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1427 PetscFunctionReturn(0); 1428 } 1429 1430 1431 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1432 { 1433 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1434 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1435 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1436 const PetscScalar *xarray; 1437 PetscScalar *zarray,*dptr,*beta; 1438 PetscErrorCode ierr; 1439 cudaError_t cerr; 1440 cusparseStatus_t stat; 1441 PetscBool compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 1442 1443 PetscFunctionBegin; 1444 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1445 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1446 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1447 try { 1448 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1449 1450 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 1451 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 1452 1453 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */ 1454 beta = (yy == zz && !compressed) ? 
matstruct->beta_one : matstruct->beta_zero; 1455 1456 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1457 /* csr_spmv does y = alpha*Ax + beta*y */ 1458 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1459 /* here we need to be careful to set the number of rows in the multiply to the 1460 number of compressed rows in the matrix ... which is equivalent to the 1461 size of the workVector */ 1462 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1463 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1464 mat->num_rows, mat->num_cols, 1465 mat->num_entries, matstruct->alpha, matstruct->descr, 1466 mat->values->data().get(), mat->row_offsets->data().get(), 1467 mat->column_indices->data().get(), xarray, beta, 1468 dptr);CHKERRCUSPARSE(stat); 1469 } else { 1470 if (cusparsestruct->nrows) { 1471 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1472 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1473 matstruct->alpha, matstruct->descr, hybMat, 1474 xarray, beta, 1475 dptr);CHKERRCUSPARSE(stat); 1476 } 1477 } 1478 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1479 1480 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 1481 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 1482 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 1483 } else if (zz != yy) { /* A's not compreseed. zz already contains A*xx, and we just need to add yy */ 1484 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 1485 } 1486 } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 1487 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 1488 } 1489 1490 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 1491 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1492 if (compressed) { 1493 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 1494 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1495 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1496 VecCUDAPlusEquals()); 1497 } 1498 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1499 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1500 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 1501 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 1502 } catch(char *ex) { 1503 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1504 } 1505 cerr = WaitForGPU();CHKERRCUDA(cerr); 1506 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1507 PetscFunctionReturn(0); 1508 } 1509 1510 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1511 { 1512 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1513 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1514 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1515 const PetscScalar *xarray; 1516 PetscScalar *zarray,*beta; 1517 PetscErrorCode ierr; 1518 cudaError_t cerr; 1519 cusparseStatus_t stat; 1520 1521 PetscFunctionBegin; 1522 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1523 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1524 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1525 if 
(!matstructT) { 1526 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1527 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1528 } 1529 1530 /* Note unlike Mat, MatTranspose uses non-compressed row storage */ 1531 try { 1532 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1533 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1534 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1535 beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero; 1536 1537 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1538 /* multiply add with matrix transpose */ 1539 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1540 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1541 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1542 mat->num_rows, mat->num_cols, 1543 mat->num_entries, matstructT->alpha, matstructT->descr, 1544 mat->values->data().get(), mat->row_offsets->data().get(), 1545 mat->column_indices->data().get(), xarray, beta, 1546 zarray);CHKERRCUSPARSE(stat); 1547 } else { 1548 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1549 if (cusparsestruct->nrows) { 1550 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1551 matstructT->alpha, matstructT->descr, hybMat, 1552 xarray, beta, 1553 zarray);CHKERRCUSPARSE(stat); 1554 } 1555 } 1556 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1557 1558 if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);} 1559 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1560 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1561 } catch(char *ex) { 1562 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1563 } 1564 cerr = WaitForGPU();CHKERRCUDA(cerr); 1565 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1566 PetscFunctionReturn(0); 1567 } 1568 1569 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1570 { 1571 PetscErrorCode ierr; 1572 1573 PetscFunctionBegin; 
1574 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1575 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 1576 if (A->factortype == MAT_FACTOR_NONE) { 1577 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1578 } 1579 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1580 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1581 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1582 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1583 PetscFunctionReturn(0); 1584 } 1585 1586 /* --------------------------------------------------------------------------------*/ 1587 /*@ 1588 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1589 (the default parallel PETSc format). This matrix will ultimately pushed down 1590 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1591 assembly performance the user should preallocate the matrix storage by setting 1592 the parameter nz (or the array nnz). By setting these parameters accurately, 1593 performance during matrix assembly can be increased by more than a factor of 50. 1594 1595 Collective 1596 1597 Input Parameters: 1598 + comm - MPI communicator, set to PETSC_COMM_SELF 1599 . m - number of rows 1600 . n - number of columns 1601 . nz - number of nonzeros per row (same for all rows) 1602 - nnz - array containing the number of nonzeros in the various rows 1603 (possibly different for each row) or NULL 1604 1605 Output Parameter: 1606 . A - the matrix 1607 1608 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1609 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1610 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1611 1612 Notes: 1613 If nnz is given then nz is ignored 1614 1615 The AIJ format (also called the Yale sparse matrix format or 1616 compressed row storage), is fully compatible with standard Fortran 77 1617 storage. 
That is, the stored row and column indices can begin at 1618 either one (as in Fortran) or zero. See the users' manual for details. 1619 1620 Specify the preallocated storage with either nz or nnz (not both). 1621 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1622 allocation. For large problems you MUST preallocate memory or you 1623 will get TERRIBLE performance, see the users' manual chapter on matrices. 1624 1625 By default, this format uses inodes (identical nodes) when possible, to 1626 improve numerical efficiency of matrix-vector products and solves. We 1627 search for consecutive rows with the same nonzero structure, thereby 1628 reusing matrix information to achieve increased efficiency. 1629 1630 Level: intermediate 1631 1632 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1633 @*/ 1634 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1635 { 1636 PetscErrorCode ierr; 1637 1638 PetscFunctionBegin; 1639 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1640 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1641 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1642 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1643 PetscFunctionReturn(0); 1644 } 1645 1646 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1647 { 1648 PetscErrorCode ierr; 1649 1650 PetscFunctionBegin; 1651 if (A->factortype==MAT_FACTOR_NONE) { 1652 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { 1653 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1654 } 1655 } else { 1656 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1657 } 1658 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1659 PetscFunctionReturn(0); 1660 } 1661 1662 static PetscErrorCode 
MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 1663 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1664 { 1665 PetscErrorCode ierr; 1666 Mat C; 1667 cusparseStatus_t stat; 1668 cusparseHandle_t handle=0; 1669 1670 PetscFunctionBegin; 1671 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1672 C = *B; 1673 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1674 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1675 1676 /* inject CUSPARSE-specific stuff */ 1677 if (C->factortype==MAT_FACTOR_NONE) { 1678 /* you cannot check the inode.use flag here since the matrix was just created. 1679 now build a GPU matrix data structure */ 1680 C->spptr = new Mat_SeqAIJCUSPARSE; 1681 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1682 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1683 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1684 ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0; 1685 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1686 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1687 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1688 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1689 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1690 } else { 1691 /* NEXT, set the pointers to the triangular factors */ 1692 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1693 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1694 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1695 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1696 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1697 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1698 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1699 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1700 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1701 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1702 
((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1703 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1704 } 1705 1706 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1707 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1708 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1709 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1710 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1711 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1712 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1713 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1714 C->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 1715 1716 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1717 1718 C->boundtocpu = PETSC_FALSE; 1719 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 1720 1721 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1722 PetscFunctionReturn(0); 1723 } 1724 1725 1726 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 1727 { 1728 PetscFunctionBegin; 1729 /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU. 1730 If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one. 1731 Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case. 1732 TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries; 1733 can follow the example of MatBindToCPU_SeqAIJViennaCL(). 
*/ 1734 if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); 1735 if (flg) { 1736 A->ops->mult = MatMult_SeqAIJ; 1737 A->ops->multadd = MatMultAdd_SeqAIJ; 1738 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 1739 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 1740 A->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 1741 A->ops->duplicate = MatDuplicate_SeqAIJ; 1742 } else { 1743 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1744 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1745 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1746 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1747 A->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1748 A->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1749 A->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1750 } 1751 A->boundtocpu = flg; 1752 PetscFunctionReturn(0); 1753 } 1754 1755 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1756 { 1757 PetscErrorCode ierr; 1758 cusparseStatus_t stat; 1759 cusparseHandle_t handle=0; 1760 1761 PetscFunctionBegin; 1762 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1763 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1764 1765 if (B->factortype==MAT_FACTOR_NONE) { 1766 /* you cannot check the inode.use flag here since the matrix was just created. 
1767 now build a GPU matrix data structure */ 1768 B->spptr = new Mat_SeqAIJCUSPARSE; 1769 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1770 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1771 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1772 ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0; 1773 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1774 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1775 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1776 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1777 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1778 } else { 1779 /* NEXT, set the pointers to the triangular factors */ 1780 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1781 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1782 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1783 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1784 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1785 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1786 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1787 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1788 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1789 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1790 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1791 } 1792 1793 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1794 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1795 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1796 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1797 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1798 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1799 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1800 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1801 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 1802 1803 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1804 1805 
B->boundtocpu = PETSC_FALSE; 1806 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 1807 1808 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1809 PetscFunctionReturn(0); 1810 } 1811 1812 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1813 { 1814 PetscErrorCode ierr; 1815 1816 PetscFunctionBegin; 1817 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1818 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1819 PetscFunctionReturn(0); 1820 } 1821 1822 /*MC 1823 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1824 1825 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1826 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1827 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1828 1829 Options Database Keys: 1830 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1831 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1832 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1833 1834 Level: beginner 1835 1836 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1837 M*/ 1838 1839 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1840 1841 1842 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1843 { 1844 PetscErrorCode ierr; 1845 1846 PetscFunctionBegin; 1847 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1848 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1849 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1850 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1851 PetscFunctionReturn(0); 1852 } 1853 1854 1855 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1856 { 1857 cusparseStatus_t stat; 1858 cusparseHandle_t handle; 1859 1860 PetscFunctionBegin; 1861 if (*cusparsestruct) { 1862 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1863 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1864 delete (*cusparsestruct)->workVector; 1865 delete (*cusparsestruct)->rowoffsets_gpu; 1866 if (handle = (*cusparsestruct)->handle) { 1867 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 1868 } 1869 delete *cusparsestruct; 1870 *cusparsestruct = 0; 1871 } 1872 PetscFunctionReturn(0); 1873 } 1874 1875 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1876 { 1877 PetscFunctionBegin; 1878 if (*mat) { 1879 delete (*mat)->values; 1880 delete (*mat)->column_indices; 1881 delete (*mat)->row_offsets; 1882 delete *mat; 1883 *mat = 0; 
1884 } 1885 PetscFunctionReturn(0); 1886 } 1887 1888 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1889 { 1890 cusparseStatus_t stat; 1891 PetscErrorCode ierr; 1892 1893 PetscFunctionBegin; 1894 if (*trifactor) { 1895 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 1896 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 1897 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1898 delete *trifactor; 1899 *trifactor = 0; 1900 } 1901 PetscFunctionReturn(0); 1902 } 1903 1904 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1905 { 1906 CsrMatrix *mat; 1907 cusparseStatus_t stat; 1908 cudaError_t err; 1909 1910 PetscFunctionBegin; 1911 if (*matstruct) { 1912 if ((*matstruct)->mat) { 1913 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1914 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1915 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 1916 } else { 1917 mat = (CsrMatrix*)(*matstruct)->mat; 1918 CsrMatrix_Destroy(&mat); 1919 } 1920 } 1921 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 1922 delete (*matstruct)->cprowIndices; 1923 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1924 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 1925 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 1926 delete *matstruct; 1927 *matstruct = 0; 1928 } 1929 PetscFunctionReturn(0); 1930 } 1931 1932 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1933 { 1934 cusparseHandle_t handle; 1935 cusparseStatus_t stat; 1936 1937 PetscFunctionBegin; 1938 if (*trifactors) { 
1939 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1940 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1941 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1942 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1943 delete (*trifactors)->rpermIndices; 1944 delete (*trifactors)->cpermIndices; 1945 delete (*trifactors)->workVector; 1946 if (handle = (*trifactors)->handle) { 1947 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 1948 } 1949 delete *trifactors; 1950 *trifactors = 0; 1951 } 1952 PetscFunctionReturn(0); 1953 } 1954 1955