1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 22 23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 26 27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 36 37 static PetscErrorCode 
CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
   MatCUSPARSESetStream - caches the given CUDA stream in the matrix's
   Mat_SeqAIJCUSPARSE struct and attaches it to the matrix's cuSPARSE handle,
   so subsequent cuSPARSE calls made through that handle run on this stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle on the matrix.
   Any handle the matrix previously held is destroyed first (the matrix owned it),
   and the pointer mode of the installed handle is set to DEVICE.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* drop (and destroy) the handle the matrix currently owns before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - forgets the matrix's cuSPARSE handle WITHOUT destroying it.
   NOTE(review): presumably used when the handle is externally owned (cf. MatCUSPARSESetHandle),
   so the matrix must not destroy it at teardown — confirm against callers.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Reports MATSOLVERCUSPARSE as the solver package for factors produced by this file. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   Factory for MATSOLVERCUSPARSE factor matrices: creates an empty SEQAIJCUSPARSE
   matrix of the requested factor type and wires in the symbolic-factorization
   entry points; numeric entry points are installed by the symbolic routines.
   NOTE(review): B is sized n x n from A->rmap->n, i.e. A is assumed square — confirm.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format in the Mat_SeqAIJCUSPARSE struct.  MULT and ALL set the same
   field here, since the sequential matrix keeps a single format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to MatCUSPARSESetFormat_SeqAIJCUSPARSE (or the MPI variant); no-op if the method is absent */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Processes -mat_cusparse_mult_storage_format (SpMV only, non-factored matrices)
   and -mat_cusparse_storage_format (SpMV and triangular solve) options. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: delegates to the CPU SeqAIJ routine, then installs the CUSPARSE
   numeric-factorization entry point on the factor matrix. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: delegates to the CPU SeqAIJ routine, then installs the CUSPARSE
   numeric-factorization entry point on the factor matrix. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: delegates to the CPU SeqAIJ routine, then installs the CUSPARSE
   Cholesky numeric entry point on the factor matrix. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: delegates to the CPU SeqAIJ routine, then installs the CUSPARSE
   Cholesky numeric entry point on the factor matrix. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Extracts the lower-triangular part of the host factor stored in A (a SeqAIJ matrix
   holding both L and U), builds it as a unit-diagonal CSR matrix on the GPU
   (explicit 1.0 entries are written on the diagonal), and runs the cuSPARSE
   triangular-solve analysis on it.  Skipped when the GPU copy is already current
   (valid_GPU_matrix is neither UNALLOCATED nor CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n diagonal ones plus the strictly-lower entries of rows 1..n-1 (row 0 has none) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host buffers: assembled on the CPU, then copied to the GPU via thrust assign() below */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal entry */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device-side copies of the host CSR arrays (assign() uploads host -> device) */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      /* sizeof(int): the GPU index arrays are THRUSTINTARRAY32, presumably 32-bit int — TODO confirm */
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Extracts the upper-triangular part of the host factor stored in A, with the
   diagonal replaced by its reciprocal (1/d), builds it as a non-unit-diagonal CSR
   matrix on the GPU, and runs the cuSPARSE triangular-solve analysis on it.
   The rows are assembled back-to-front using a->diag (adiag) offsets.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* diagonal is stored inverted (1/d) so the GPU solve can multiply instead of divide */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      /* sizeof(int): the GPU index arrays are THRUSTINTARRAY32, presumably 32-bit int — TODO confirm */
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds both ILU triangular factors on the GPU, allocates the solve work vector,
   and uploads the row/column permutation indices (only when the permutations are
   not the identity, in which case the natural-ordering solve path is used instead).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutation arrays were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds both triangular factors of the ICC/Cholesky factorization on the GPU from
   the host factor, which stores only the upper triangle:
     - upTriFactor: the upper factor as unit-diagonal CSR (off-diagonals negated,
       diagonal stored inverted), solved NON_TRANSPOSE;
     - loTriFactor: stored in the SAME upper CSR layout (fill mode UPPER) but with
       solveOp = TRANSPOSE, so it acts as the lower factor without an explicit transpose.
   NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and to Mat_SeqSBAIJ
   (for i/j/a) — this relies on the factor storing SBAIJ-compatible fields; confirm.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* both factors store the inverted diagonal 1/d at the row start */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* upper factor gets negated off-diagonals; lower factor additionally scaled by 1/d */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      /* UPPER fill mode is intentional: same storage as the upper factor, ... */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      /* ... solved with the TRANSPOSE op so it behaves as the lower factor */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds both ICC triangular factors on the GPU, allocates the solve work vector,
   and — when the permutation is not the identity — uploads the permutation and its
   inverse for use as row/column reorderings during the solve.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* nnz of the full (symmetric) factor: off-diagonals counted twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) /* numeric LU on the CPU, then factor upload/analysis on the GPU */
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself is done on the host by the SeqAIJ kernel */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* choose the MatSolve flavor: the natural-ordering fast path applies only when
     BOTH the row and the column permutations are the identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  B->ops->matsolve          = NULL;  /* no GPU MatMatSolve in either case */
  B->ops->matsolvetranspose = NULL;
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* build the GPU triangular factors and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC: CPU factorization followed by upload of the triangular
   factors to the GPU and selection of the appropriate MatSolve variant. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 709 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 710 if (perm_identity) { 711 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 712 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 713 B->ops->matsolve = NULL; 714 B->ops->matsolvetranspose = NULL; 715 } else { 716 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 717 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 718 B->ops->matsolve = NULL; 719 B->ops->matsolvetranspose = NULL; 720 } 721 722 /* get the triangular factors */ 723 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } 726 727 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 728 { 729 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 730 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 731 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 732 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 733 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 734 cusparseStatus_t stat; 735 cusparseIndexBase_t indexBase; 736 cusparseMatrixType_t matrixType; 737 cusparseFillMode_t fillMode; 738 cusparseDiagType_t diagType; 739 740 PetscFunctionBegin; 741 742 /*********************************************/ 743 /* Now the Transpose of the Lower Tri Factor */ 744 /*********************************************/ 745 746 /* allocate space for the transpose of the lower triangular factor */ 747 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 748 749 /* set the matrix descriptors of the lower triangular factor */ 750 matrixType = cusparseGetMatType(loTriFactor->descr); 751 indexBase = 
cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing flips the triangle: lower <-> upper fill mode */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the explicit transpose is solved with NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  /* transposing flips the triangle: upper <-> lower fill mode */
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Build an explicit transpose of A on the GPU (CSR or HYB/ELL, matching the
   storage format) so the MatMultTranspose paths can run a NON_TRANSPOSE SpMV. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-side scalars, required by the POINTER_MODE_DEVICE set below */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    /* the transpose has swapped dimensions */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU):
       rows absent from cprowIndices are empty, so they repeat the previous offset */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    /* sanity check: every compressed row must have been consumed */
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose, i.e. the CSC of A (expanded row offsets are used here) */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so its "compressed" row list is simply 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (transpose solve visits the factors in reverse order: U then L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transpose triangular solve when the factors are in natural ordering:
   no permutations are applied, b is consumed directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (transpose solve order is U then L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve with row/column permutations:
   x = P_c^T U^{-1} L^{-1} P_r b, all on the GPU. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation (iteration count comes from the
     rpermIndices begin/end pair) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Triangular solve in natural ordering: x = U^{-1} L^{-1} b, no permutations. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
&PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirror the host AIJ matrix onto the GPU. If only values changed (same nonzero
   pattern, CSR format) just re-upload the values; otherwise rebuild the whole GPU
   structure, forcing compressed-row storage so empty rows are skipped in SpMV. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* NOTE(review): return value of the destroy routine is not checked here */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets/indices for the
             nonempty rows only */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-side alpha/beta scalars, paired with POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage through a temporary CSR matrix, convert to HYB, then discard it */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the non-compressed case */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate, elementwise, the first tuple component into the
   second (t<1> += t<0>); used to scatter-add the compressed-row SpMV result. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A x, implemented as MatMultAdd with a NULL additive vector. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x, using the explicitly stored transpose (built on first use). */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr =
PetscLogGpuTimeBegin();CHKERRQ(ierr); 1404 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1405 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1406 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1407 mat->num_rows, mat->num_cols, 1408 mat->num_entries, matstructT->alpha, matstructT->descr, 1409 mat->values->data().get(), mat->row_offsets->data().get(), 1410 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1411 yarray);CHKERRCUDA(stat); 1412 } else { 1413 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1414 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1415 matstructT->alpha, matstructT->descr, hybMat, 1416 xarray, matstructT->beta_zero, 1417 yarray);CHKERRCUDA(stat); 1418 } 1419 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1420 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1421 ierr = WaitForGPU();CHKERRCUDA(ierr); 1422 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1423 ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1424 PetscFunctionReturn(0); 1425 } 1426 1427 1428 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1429 { 1430 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1431 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1432 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1433 const PetscScalar *xarray; 1434 PetscScalar *zarray,*dptr,*beta; 1435 PetscErrorCode ierr; 1436 cusparseStatus_t stat; 1437 PetscBool cmpr; /* if the matrix has been compressed (zero rows) */ 1438 1439 PetscFunctionBegin; 1440 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1441 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1442 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1443 try { 1444 cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, 
thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      /* zz is fully overwritten (or scatter-filled) below, so its old contents need not come to the GPU */
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* compressed storage: spmv writes directly into zz; otherwise into the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in-place only when the addend already lives in the output buffer */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result still sits in workVector: seed zz with yy, then scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* no addend: zero zz so the scatter-add below produces exactly A*xx */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zip (workVector[i], zz[cprowIndices[i]]) and apply dst += src for each compressed row */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy.  Mirrors MatMultAdd but uses the lazily built device-side
   transpose; the spmv itself runs in NON_TRANSPOSE mode (continues in the next chunk). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed zz with the addend yy before the spmv */
    ierr =
VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write directly into zz only when the work vector spans all cmap->n output entries */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* result is still in workVector: re-seed zz with yy, then scatter-add below */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation.
         NOTE(review): the iteration length here is A->cmap->n, but this branch is taken
         precisely when workVector->size() != A->cmap->n; the analogous scatter in
         MatMultAdd uses workVector->size() as the bound.  Verify this cannot read past
         the end of workVector/cprowIndices. */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* After a final assembly on the CPU side, mirror the matrix to the GPU and
   (re)install the CUSPARSE mult kernels (flush assemblies are skipped). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations.
   For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes, so preallocate directly */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Release the GPU-side data (mult structs or triangular factors) and then
   fall through to the base SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* nothing was ever pushed to the GPU if the matrix is still unallocated there */
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicate via the base SeqAIJ duplicate, then attach a fresh (empty) CUSPARSE
   context to the copy; the GPU data itself is rebuilt lazily on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    /* each matrix owns its own cusparse handle */
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  /* install the CUSPARSE operations on the copy */
  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet; CopyToGPU happens lazily */
  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* In-place conversion of a SeqAIJ matrix to SeqAIJCUSPARSE: attaches the GPU
   context, switches the default vector type to CUDA, and installs the CUSPARSE ops. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors.
       NOTE(review): unlike MatDuplicate above, ->handle is not zeroed before
       cusparseCreate here; harmless, but the two init paths differ. */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* GPU copy is created lazily on first assembly/mult */
  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a plain SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Register the CUSPARSE solver package for all factorization types it supports */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* Free the whole GPU-side context of a non-factored matrix: both mult structs,
   the work vector, and the cusparse handle; then delete the struct and NULL the pointer. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* NOTE(review): the PetscErrorCode returned by these two destroy calls is
       discarded, unlike everywhere else in this file where CHKERRQ is used */
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    /* intentional assignment inside the condition */
    if (handle = (*cusparsestruct)->handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a device CSR matrix (thrust vectors for values/cols/rows) and NULL the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor struct: its matrix descriptor, the cusparse solve
   analysis info, and the device CSR data; then delete the struct and NULL the pointer.
   Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one mult struct: the stored matrix (HYB or CSR depending on format), the
   matrix descriptor, the compressed-row index vector, and the device-resident
   alpha/beta scalars; then delete the struct and NULL the pointer.
   Safe to call with *matstruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* check the error code instead of dropping it */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one live in device memory (pointer-mode DEVICE), so cudaFree them */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free the full triangular-factor context of a factored matrix: all four factor
   structs (direct and transpose), the permutation index vectors, the work vector,
   and the cusparse handle; then delete the struct and NULL the pointer.
   Safe to call with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* propagate failures from the per-factor destroys (previously ignored) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;        /* explicit assignment, not assignment-in-condition */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
