/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Names for the MatCUSPARSEStorageFormat enum values; the trailing three entries
   (enum type name, option prefix, terminating 0) follow the PetscOptionsEnum() convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, multiply, options, and
   destroy routines implemented later in this file. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* NOTE(review): the next two declarations share the name MatSeqAIJCUSPARSEMultStruct_Destroy
   yet the first takes a Mat_SeqAIJCUSPARSETriFactorStruct** — this looks like it was meant to be
   MatSeqAIJCUSPARSETriFactorStruct_Destroy; confirm against the definition later in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Set the CUDA stream used by the matrix's cusparse handle; the stream is
   cached in the Mat_SeqAIJCUSPARSE structure. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix's cusparse handle (destroying any existing, different handle)
   and set device pointer mode on it. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* a different handle is being installed; release the old one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the matrix's cusparse handle; the caller that
   installed the handle retains ownership of it. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
  Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create a CUSPARSE factor matrix B (square, sized by the row dimension of A) and
   install the symbolic factorization routines for the requested factor type; the
   numeric routines are installed by the symbolic ones. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the CSR structure is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format (both supported ops currently set the same field). */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently does nothing if the type-specific setter is not composed */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_[mult_]storage_format options; the SpMV-only option applies
   just to unfactored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU is done by the CPU AIJ implementation; the numeric phase is then
   redirected to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU on the CPU; the numeric phase is redirected to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC on the CPU; the numeric phase is redirected to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky on the CPU; the numeric phase is redirected to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit-diagonal lower triangular factor L from the combined LU storage of
   the (already numerically factored) host matrix, upload it to the GPU as a 32-bit CSR,
   and run the cusparse triangular-solve analysis on it.  Host staging buffers are
   allocated with cudaMallocHost (pinned) for fast transfer. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 of L is just the unit diagonal entry; rows 1..n-1 contribute their strictly-lower
         entries (ai[n]-ai[1]) plus one unit diagonal each, hence n + ai[n] - ai[1] */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* append the implicit unit diagonal entry after the strictly-lower part of row i */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* thrust assign() copies the pinned host buffers to the device vectors */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the upper triangular factor U (non-unit diagonal, stored inverted) from the
   combined LU storage of the host matrix, upload it to the GPU as a 32-bit CSR, and run
   the cusparse triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         adiag[] is stored in decreasing order, so adiag[0]-adiag[n] counts U's entries */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first since
         the factored storage keeps U rows in reverse order */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the factored storage holds 1/U(i,i) at v[nz]; invert back to U(i,i) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU, allocate the solve work vector, and
   cache the row/column permutations (only when non-identity) for use in MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only upload the row permutation when it is not the identity */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutations were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC triangular factors on the GPU from the host Cholesky factor.
   The "lower" factor is stored as the transpose of the upper factor
   (FILL_MODE_UPPER + CUSPARSE_OPERATION_TRANSPOSE), so only one index structure
   is staged; AAUp/AALo hold the respective scaled values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as both Mat_SeqAIJ and Mat_SeqSBAIJ here;
     this relies on the i/j/a members lining up -- confirm intended */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonal values; the "lower" copy is additionally scaled by 1/diag */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      /* UPPER fill mode on purpose: the lower solve is done as the TRANSPOSE of this matrix */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC factors on the GPU, allocate the solve work vector, and cache the
   permutation and its inverse (only when non-identity) for use in MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both triangular factors share the off-diagonal structure: (a->nz-n) off-diagonal
     entries each, plus n diagonal entries counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* ICC uses the same permutation for rows and columns, so upload it and its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric LU factorization itself runs on the CPU; only the triangular solves use the GPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* no permutations: the cheaper natural-ordering solves apply */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC on the CPU, then select the solve kernels and upload the
   triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build the transposes (CSC form) of both triangular factors on the GPU and run the
   cusparse solve analysis on them, for use by MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase =
cusparseGetMatIndexBase(loTriFactor->descr); 752 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 753 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 754 diagType = cusparseGetMatDiagType(loTriFactor->descr); 755 756 /* Create the matrix description */ 757 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 758 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 759 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 760 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 761 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 762 763 /* Create the solve analysis information */ 764 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 765 766 /* set the operation */ 767 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 768 769 /* allocate GPU space for the CSC of the lower triangular factor*/ 770 loTriFactorT->csrMat = new CsrMatrix; 771 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 772 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 773 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 774 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 775 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 776 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 777 778 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 779 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 780 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 781 loTriFactor->csrMat->values->data().get(), 782 loTriFactor->csrMat->row_offsets->data().get(), 783 loTriFactor->csrMat->column_indices->data().get(), 784 loTriFactorT->csrMat->values->data().get(), 785 loTriFactorT->csrMat->column_indices->data().get(), 786 loTriFactorT->csrMat->row_offsets->data().get(), 787 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 788 789 /* perform the solve analysis on the transposed matrix */ 790 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 791 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 792 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 793 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 794 loTriFactorT->solveInfo);CHKERRCUDA(stat); 795 796 /* assign the pointer. Is this really necessary? */ 797 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 798 799 /*********************************************/ 800 /* Now the Transpose of the Upper Tri Factor */ 801 /*********************************************/ 802 803 /* allocate space for the transpose of the upper triangular factor */ 804 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 805 806 /* set the matrix descriptors of the upper triangular factor */ 807 matrixType = cusparseGetMatType(upTriFactor->descr); 808 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 809 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
810 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 811 diagType = cusparseGetMatDiagType(upTriFactor->descr); 812 813 /* Create the matrix description */ 814 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 815 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 816 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 817 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 818 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 819 820 /* Create the solve analysis information */ 821 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 822 823 /* set the operation */ 824 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 825 826 /* allocate GPU space for the CSC of the upper triangular factor*/ 827 upTriFactorT->csrMat = new CsrMatrix; 828 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 829 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 830 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 831 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 832 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 833 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 834 835 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 836 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 837 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 838 upTriFactor->csrMat->values->data().get(), 839 upTriFactor->csrMat->row_offsets->data().get(), 840 upTriFactor->csrMat->column_indices->data().get(), 841 upTriFactorT->csrMat->values->data().get(), 842 upTriFactorT->csrMat->column_indices->data().get(), 843 upTriFactorT->csrMat->row_offsets->data().get(), 844 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 845 846 /* perform the solve analysis on the transposed matrix */ 847 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 848 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 849 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 850 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 851 upTriFactorT->solveInfo);CHKERRCUDA(stat); 852 853 /* assign the pointer. Is this really necessary? 
*/ 854 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 855 PetscFunctionReturn(0); 856 } 857 858 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 859 { 860 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 861 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 862 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 863 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 864 cusparseStatus_t stat; 865 cusparseIndexBase_t indexBase; 866 cudaError_t err; 867 PetscErrorCode ierr; 868 869 PetscFunctionBegin; 870 871 /* allocate space for the triangular factor information */ 872 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 873 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 874 indexBase = cusparseGetMatIndexBase(matstruct->descr); 875 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 876 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 877 878 /* set alpha and beta */ 879 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 880 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 881 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 882 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 883 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 884 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 885 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 886 887 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 888 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 889 
CsrMatrix *matrixT= new CsrMatrix; 890 thrust::device_vector<int> *rowoffsets_gpu; 891 const int ncomprow = matstruct->cprowIndices->size(); 892 thrust::host_vector<int> cprow(ncomprow); 893 cprow = *matstruct->cprowIndices; // GPU --> CPU 894 matrixT->num_rows = A->cmap->n; 895 matrixT->num_cols = A->rmap->n; 896 matrixT->num_entries = a->nz; 897 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 898 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 899 matrixT->values = new THRUSTARRAY(a->nz); 900 PetscInt k,i; 901 thrust::host_vector<int> rowst_host(A->rmap->n+1); 902 903 /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */ 904 rowst_host[0] = 0; 905 for (k = 0, i = 0; i < A->rmap->n ; i++) { 906 if (k < ncomprow && i==cprow[k]) { 907 rowst_host[i+1] = a->i[i+1]; 908 k++; 909 } else rowst_host[i+1] = rowst_host[i]; 910 } 911 if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow); 912 rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 913 *rowoffsets_gpu = rowst_host; // CPU --> GPU 914 915 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 916 indexBase = cusparseGetMatIndexBase(matstruct->descr); 917 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 918 A->cmap->n, matrix->num_entries, 919 matrix->values->data().get(), 920 rowoffsets_gpu->data().get(), 921 matrix->column_indices->data().get(), 922 matrixT->values->data().get(), 923 matrixT->column_indices->data().get(), 924 matrixT->row_offsets->data().get(), 925 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 926 927 /* assign the pointer */ 928 matstructT->mat = matrixT; 929 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 930 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 931 /* First convert HYB to CSR */ 932 CsrMatrix *temp= new CsrMatrix; 933 temp->num_rows = A->rmap->n; 934 temp->num_cols = A->cmap->n; 935 temp->num_entries = a->nz; 936 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 937 temp->column_indices = new THRUSTINTARRAY32(a->nz); 938 temp->values = new THRUSTARRAY(a->nz); 939 940 941 stat = cusparse_hyb2csr(cusparsestruct->handle, 942 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 943 temp->values->data().get(), 944 temp->row_offsets->data().get(), 945 temp->column_indices->data().get());CHKERRCUDA(stat); 946 947 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 948 CsrMatrix *tempT= new CsrMatrix; 949 tempT->num_rows = A->rmap->n; 950 tempT->num_cols = A->cmap->n; 951 tempT->num_entries = a->nz; 952 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 953 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 954 tempT->values = new THRUSTARRAY(a->nz); 955 956 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 957 temp->num_cols, temp->num_entries, 958 temp->values->data().get(), 959 temp->row_offsets->data().get(), 960 temp->column_indices->data().get(), 961 tempT->values->data().get(), 962 tempT->column_indices->data().get(), 963 tempT->row_offsets->data().get(), 964 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 965 966 /* Last, convert CSC to HYB */ 967 cusparseHybMat_t hybMat; 968 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 969 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 970 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 971 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 972 matstructT->descr, tempT->values->data().get(), 973 tempT->row_offsets->data().get(), 974 tempT->column_indices->data().get(), 975 hybMat, 0, partition);CHKERRCUDA(stat); 976 977 /* assign the pointer */ 978 matstructT->mat = hybMat; 979 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 980 981 /* delete temporaries */ 982 if (tempT) { 983 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 984 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 985 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 986 delete (CsrMatrix*) tempT; 987 } 988 if (temp) { 989 if (temp->values) delete (THRUSTARRAY*) temp->values; 990 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 991 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 992 delete (CsrMatrix*) temp; 993 
} 994 } 995 /* assign the compressed row indices */ 996 matstructT->cprowIndices = new THRUSTINTARRAY; 997 matstructT->cprowIndices->resize(A->cmap->n); 998 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 999 /* assign the pointer */ 1000 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1001 PetscFunctionReturn(0); 1002 } 1003 1004 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1005 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1006 { 1007 PetscInt n = xx->map->n; 1008 const PetscScalar *barray; 1009 PetscScalar *xarray; 1010 thrust::device_ptr<const PetscScalar> bGPU; 1011 thrust::device_ptr<PetscScalar> xGPU; 1012 cusparseStatus_t stat; 1013 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1014 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1015 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1016 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1017 PetscErrorCode ierr; 1018 1019 PetscFunctionBegin; 1020 /* Analyze the matrix and create the transpose ... 
on the fly */ 1021 if (!loTriFactorT && !upTriFactorT) { 1022 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1023 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1024 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1025 } 1026 1027 /* Get the GPU pointers */ 1028 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1029 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1030 xGPU = thrust::device_pointer_cast(xarray); 1031 bGPU = thrust::device_pointer_cast(barray); 1032 1033 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1034 /* First, reorder with the row permutation */ 1035 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1036 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1037 xGPU); 1038 1039 /* First, solve U */ 1040 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1041 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1042 upTriFactorT->csrMat->values->data().get(), 1043 upTriFactorT->csrMat->row_offsets->data().get(), 1044 upTriFactorT->csrMat->column_indices->data().get(), 1045 upTriFactorT->solveInfo, 1046 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1047 1048 /* Then, solve L */ 1049 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1050 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1051 loTriFactorT->csrMat->values->data().get(), 1052 loTriFactorT->csrMat->row_offsets->data().get(), 1053 loTriFactorT->csrMat->column_indices->data().get(), 1054 loTriFactorT->solveInfo, 1055 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1056 1057 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1058 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1059 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1060 tempGPU->begin()); 1061 1062 /* Copy the temporary to the full solution. */ 1063 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1064 1065 /* restore */ 1066 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1067 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1068 ierr = WaitForGPU();CHKERRCUDA(ierr); 1069 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1070 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1071 PetscFunctionReturn(0); 1072 } 1073 1074 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1075 { 1076 const PetscScalar *barray; 1077 PetscScalar *xarray; 1078 cusparseStatus_t stat; 1079 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1080 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1081 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1082 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1083 PetscErrorCode ierr; 1084 1085 PetscFunctionBegin; 1086 /* Analyze the matrix and create the transpose ... 
on the fly */ 1087 if (!loTriFactorT && !upTriFactorT) { 1088 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1089 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1090 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1091 } 1092 1093 /* Get the GPU pointers */ 1094 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1095 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1096 1097 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1098 /* First, solve U */ 1099 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1100 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1101 upTriFactorT->csrMat->values->data().get(), 1102 upTriFactorT->csrMat->row_offsets->data().get(), 1103 upTriFactorT->csrMat->column_indices->data().get(), 1104 upTriFactorT->solveInfo, 1105 barray, tempGPU->data().get());CHKERRCUDA(stat); 1106 1107 /* Then, solve L */ 1108 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1109 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1110 loTriFactorT->csrMat->values->data().get(), 1111 loTriFactorT->csrMat->row_offsets->data().get(), 1112 loTriFactorT->csrMat->column_indices->data().get(), 1113 loTriFactorT->solveInfo, 1114 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1115 1116 /* restore */ 1117 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1118 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1119 ierr = WaitForGPU();CHKERRCUDA(ierr); 1120 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1121 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1122 PetscFunctionReturn(0); 1123 } 1124 1125 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1126 { 1127 const PetscScalar *barray; 1128 PetscScalar *xarray; 1129 thrust::device_ptr<const PetscScalar> bGPU; 1130 thrust::device_ptr<PetscScalar> 
xGPU; 1131 cusparseStatus_t stat; 1132 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1133 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1134 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1135 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1136 PetscErrorCode ierr; 1137 1138 PetscFunctionBegin; 1139 1140 /* Get the GPU pointers */ 1141 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1142 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1143 xGPU = thrust::device_pointer_cast(xarray); 1144 bGPU = thrust::device_pointer_cast(barray); 1145 1146 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1147 /* First, reorder with the row permutation */ 1148 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1149 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1150 tempGPU->begin()); 1151 1152 /* Next, solve L */ 1153 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1154 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1155 loTriFactor->csrMat->values->data().get(), 1156 loTriFactor->csrMat->row_offsets->data().get(), 1157 loTriFactor->csrMat->column_indices->data().get(), 1158 loTriFactor->solveInfo, 1159 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1160 1161 /* Then, solve U */ 1162 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1163 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1164 upTriFactor->csrMat->values->data().get(), 1165 upTriFactor->csrMat->row_offsets->data().get(), 1166 upTriFactor->csrMat->column_indices->data().get(), 1167 upTriFactor->solveInfo, 1168 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1169 1170 /* Last, reorder with the column permutation */ 1171 
thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1172 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1173 xGPU); 1174 1175 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1176 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1177 ierr = WaitForGPU();CHKERRCUDA(ierr); 1178 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1179 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1180 PetscFunctionReturn(0); 1181 } 1182 1183 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1184 { 1185 const PetscScalar *barray; 1186 PetscScalar *xarray; 1187 cusparseStatus_t stat; 1188 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1189 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1190 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1191 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1192 PetscErrorCode ierr; 1193 1194 PetscFunctionBegin; 1195 /* Get the GPU pointers */ 1196 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1197 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1198 1199 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1200 /* First, solve L */ 1201 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1202 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1203 loTriFactor->csrMat->values->data().get(), 1204 loTriFactor->csrMat->row_offsets->data().get(), 1205 loTriFactor->csrMat->column_indices->data().get(), 1206 loTriFactor->solveInfo, 1207 barray, tempGPU->data().get());CHKERRCUDA(stat); 1208 1209 /* Next, solve U */ 1210 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1211 upTriFactor->csrMat->num_rows, 
&PETSC_CUSPARSE_ONE, upTriFactor->descr, 1212 upTriFactor->csrMat->values->data().get(), 1213 upTriFactor->csrMat->row_offsets->data().get(), 1214 upTriFactor->csrMat->column_indices->data().get(), 1215 upTriFactor->solveInfo, 1216 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1217 1218 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1219 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1220 ierr = WaitForGPU();CHKERRCUDA(ierr); 1221 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1222 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1223 PetscFunctionReturn(0); 1224 } 1225 1226 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1227 { 1228 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1229 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1231 PetscInt m = A->rmap->n,*ii,*ridx; 1232 PetscErrorCode ierr; 1233 cusparseStatus_t stat; 1234 cudaError_t err; 1235 1236 PetscFunctionBegin; 1237 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1238 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1239 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1240 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1241 /* copy values only */ 1242 matrix->values->assign(a->a, a->a+a->nz); 1243 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1244 } else { 1245 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1246 try { 1247 cusparsestruct->nonzerorow=0; 1248 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1249 1250 if (a->compressedrow.use) { 1251 m = a->compressedrow.nrows; 1252 ii = a->compressedrow.i; 1253 ridx = a->compressedrow.rindex; 1254 } else { 1255 /* Forcing compressed row on the GPU */ 1256 int k=0; 1257 ierr 
= PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1258 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1259 ii[0]=0; 1260 for (int j = 0; j<m; j++) { 1261 if ((a->i[j+1]-a->i[j])>0) { 1262 ii[k] = a->i[j]; 1263 ridx[k]= j; 1264 k++; 1265 } 1266 } 1267 ii[cusparsestruct->nonzerorow] = a->nz; 1268 m = cusparsestruct->nonzerorow; 1269 } 1270 1271 /* allocate space for the triangular factor information */ 1272 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1273 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1274 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1275 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1276 1277 err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1278 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1279 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1280 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1281 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1282 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1283 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1284 1285 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1286 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1287 /* set the matrix */ 1288 CsrMatrix *matrix= new CsrMatrix; 1289 matrix->num_rows = m; 1290 matrix->num_cols = A->cmap->n; 1291 matrix->num_entries = a->nz; 1292 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1293 matrix->row_offsets->assign(ii, ii + m+1); 1294 1295 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1296 
matrix->column_indices->assign(a->j, a->j+a->nz); 1297 1298 matrix->values = new THRUSTARRAY(a->nz); 1299 matrix->values->assign(a->a, a->a+a->nz); 1300 1301 /* assign the pointer */ 1302 matstruct->mat = matrix; 1303 1304 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1305 CsrMatrix *matrix= new CsrMatrix; 1306 matrix->num_rows = m; 1307 matrix->num_cols = A->cmap->n; 1308 matrix->num_entries = a->nz; 1309 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1310 matrix->row_offsets->assign(ii, ii + m+1); 1311 1312 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1313 matrix->column_indices->assign(a->j, a->j+a->nz); 1314 1315 matrix->values = new THRUSTARRAY(a->nz); 1316 matrix->values->assign(a->a, a->a+a->nz); 1317 1318 cusparseHybMat_t hybMat; 1319 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1320 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1321 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1322 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1323 matstruct->descr, matrix->values->data().get(), 1324 matrix->row_offsets->data().get(), 1325 matrix->column_indices->data().get(), 1326 hybMat, 0, partition);CHKERRCUDA(stat); 1327 /* assign the pointer */ 1328 matstruct->mat = hybMat; 1329 1330 if (matrix) { 1331 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1332 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1333 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1334 delete (CsrMatrix*)matrix; 1335 } 1336 } 1337 1338 /* assign the compressed row indices */ 1339 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1340 matstruct->cprowIndices->assign(ridx,ridx+m); 1341 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1342 1343 /* assign the pointer */ 1344 cusparsestruct->mat = matstruct; 1345 
1346 if (!a->compressedrow.use) { 1347 ierr = PetscFree(ii);CHKERRQ(ierr); 1348 ierr = PetscFree(ridx);CHKERRQ(ierr); 1349 } 1350 cusparsestruct->workVector = new THRUSTARRAY(m); 1351 } catch(char *ex) { 1352 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1353 } 1354 cusparsestruct->nonzerostate = A->nonzerostate; 1355 } 1356 ierr = WaitForGPU();CHKERRCUDA(ierr); 1357 A->offloadmask = PETSC_OFFLOAD_BOTH; 1358 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1359 } 1360 PetscFunctionReturn(0); 1361 } 1362 1363 struct VecCUDAPlusEquals 1364 { 1365 template <typename Tuple> 1366 __host__ __device__ 1367 void operator()(Tuple t) 1368 { 1369 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1370 } 1371 }; 1372 1373 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1374 { 1375 PetscErrorCode ierr; 1376 1377 PetscFunctionBegin; 1378 ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 1379 PetscFunctionReturn(0); 1380 } 1381 1382 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1383 { 1384 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1385 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1386 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1387 const PetscScalar *xarray; 1388 PetscScalar *yarray; 1389 PetscErrorCode ierr; 1390 cusparseStatus_t stat; 1391 1392 PetscFunctionBegin; 1393 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1394 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1395 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1396 if (!matstructT) { 1397 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1398 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1399 } 1400 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1401 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1402 if (yy->map->n) { 1403 PetscInt n = 
yy->map->n; 1404 cudaError_t err; 1405 err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */ 1406 } 1407 1408 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1409 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1410 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1411 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1412 mat->num_rows, mat->num_cols, 1413 mat->num_entries, matstructT->alpha, matstructT->descr, 1414 mat->values->data().get(), mat->row_offsets->data().get(), 1415 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1416 yarray);CHKERRCUDA(stat); 1417 } else { 1418 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1419 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1420 matstructT->alpha, matstructT->descr, hybMat, 1421 xarray, matstructT->beta_zero, 1422 yarray);CHKERRCUDA(stat); 1423 } 1424 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1425 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1426 ierr = WaitForGPU();CHKERRCUDA(ierr); 1427 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1428 ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1429 PetscFunctionReturn(0); 1430 } 1431 1432 1433 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1434 { 1435 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1436 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1437 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1438 const PetscScalar *xarray; 1439 PetscScalar *zarray,*dptr,*beta; 1440 PetscErrorCode ierr; 1441 cusparseStatus_t stat; 1442 PetscBool cmpr; /* if the matrix has been compressed (zero rows) */ 1443 1444 PetscFunctionBegin; 1445 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1446 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1447 
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when the work vector covers all rows, i.e. no compressed-row (zero-row) storage */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      /* zz is fully overwritten (or filled via the scatter below); previous contents not needed */
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* SpMV destination: zz directly if uncompressed, otherwise the compressed work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one accumulates in-place only when zz already holds the addend (yy == zz) */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is still in the work vector: seed zz with yy, scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* result already in zz: add the distinct addend */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr !=
               zarray) {
      /* no addend and compressed storage: rows not represented in the work vector stay zero */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zip (workVector[i], z[cprowIndices[i]]) and apply += over the compressed rows */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* restore with the same access mode used to acquire zarray above */
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A^T*x + y, using the lazily built explicit transpose (see MatMultTranspose) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr =
           MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed zz with the addend before touching device arrays */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* SpMV destination: zz directly when the work vector spans all transpose rows (= columns of A),
       otherwise the compressed work vector, scattered into zz below */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one accumulates in-place only when zz already holds the addend (yy == zz) */
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* result is in the work vector: re-seed zz with yy, scatter-add below */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation */
      /* NOTE(review): the iteration length here is A->cmap->n, while the analogous loop in
         MatMultAdd uses workVector->size(); presumably they coincide on this path — verify */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Completes assembly, pushes the matrix to the GPU (for non-factored matrices), and
   installs the CUSPARSE mult operations */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).
   This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the GPU-side data (spptr) appropriate to the matrix kind, then the base SeqAIJ data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* spptr holds a Mat_SeqAIJCUSPARSE only once something was offloaded */
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    /* factored matrices store triangular-factor data instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicates A (via the SeqAIJ duplicate) and injects a fresh, empty CUSPARSE GPU structure;
   the device copy of the new matrix is rebuilt lazily (offloadmask = UNALLOCATED) */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  /* install the CUSPARSE implementations of the matrix operations */
  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing copied to the device yet; first use triggers MatSeqAIJCUSPARSECopyToGPU */
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr =
         PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Converts a freshly created SeqAIJ matrix in-place to SeqAIJCUSPARSE: switches the
   default vector type to CUDA, allocates the (empty) GPU data structure, and installs
   the CUSPARSE operations; mirrors the setup performed in MatDuplicate_SeqAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing copied to the device yet; first use triggers MatSeqAIJCUSPARSECopyToGPU */
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1822 1823 Level: beginner 1824 1825 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1826 M*/ 1827 1828 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1829 1830 1831 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1832 { 1833 PetscErrorCode ierr; 1834 1835 PetscFunctionBegin; 1836 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1837 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1838 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1839 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1840 PetscFunctionReturn(0); 1841 } 1842 1843 1844 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1845 { 1846 cusparseStatus_t stat; 1847 cusparseHandle_t handle; 1848 1849 PetscFunctionBegin; 1850 if (*cusparsestruct) { 1851 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1852 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1853 delete (*cusparsestruct)->workVector; 1854 if (handle = (*cusparsestruct)->handle) { 1855 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1856 } 1857 delete *cusparsestruct; 1858 *cusparsestruct = 0; 1859 } 1860 PetscFunctionReturn(0); 1861 } 1862 1863 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1864 { 1865 PetscFunctionBegin; 1866 if (*mat) { 1867 delete (*mat)->values; 1868 delete (*mat)->column_indices; 1869 delete (*mat)->row_offsets; 1870 delete *mat; 1871 *mat = 0; 1872 } 1873 PetscFunctionReturn(0); 1874 } 1875 1876 
/* Frees a triangular-factor structure: the cusparse descriptor, the analysis info,
   and the CSR storage; safe to call when *trifactor is NULL */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees a mult structure: the stored matrix (HYB or CSR depending on format), the
   descriptor, the compressed-row index array, and the device-resident alpha/beta
   scalars; safe to call when *matstruct is NULL */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code, matching the trifactor variant above */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta were allocated with cudaMalloc (device pointer mode) */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees all four triangular factors, the permutation index arrays, the work vector,
   and the cusparse handle of a factored matrix; safe to call when *trifactors is NULL */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1928 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1929 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1930 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1931 delete (*trifactors)->rpermIndices; 1932 delete (*trifactors)->cpermIndices; 1933 delete (*trifactors)->workVector; 1934 if (handle = (*trifactors)->handle) { 1935 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1936 } 1937 delete *trifactors; 1938 *trifactors = 0; 1939 } 1940 PetscFunctionReturn(0); 1941 } 1942 1943