/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for the MatCUSPARSEStorageFormat enum; the trailing entries
   (type name, option prefix, 0 sentinel) follow the PetscOptionsEnum() convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: Cholesky/ICC factorization kernels defined in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: LU/ILU factorization kernels defined in this file */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves, options handling, and mat-vec kernels */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destructors for the GPU-side helper structures */
static PetscErrorCode
CsrMatrix_Destroy(CsrMatrix**);
/* NOTE(review): the next two declarations overload the same name for two different
   struct types (legal in C++/CUDA, but unusual); confirm the TriFactorStruct
   overload was not meant to be a distinct *TriFactorStruct_Destroy routine. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Record the CUDA stream in the matrix's CUSPARSE context and make the
   CUSPARSE handle use it for subsequent library calls on this matrix. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix's CUSPARSE handle with a caller-supplied one (destroying
   any handle currently held) and select device pointer mode, i.e. scalar
   arguments to CUSPARSE calls are expected to live in device memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle we currently hold before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop the stored CUSPARSE handle without destroying it; ownership of the
   handle itself stays with whoever installed it via MatCUSPARSESetHandle(). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package name (MATSOLVERCUSPARSE) for factored matrices. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates the (square) factor
   matrix B for A and installs the symbolic-factorization function pointers for
   the requested factor type; the numeric routines are installed later by the
   symbolic phase. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* the factor is sized n x n from A's row size */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated by the factorization itself, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* a sequential matrix stores a single format, so MULT and ALL set the same field */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.
  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

  Output Parameter:

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if the matrix type provides one */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_mult_storage_format and -mat_cusparse_storage_format
   runtime options and forward the chosen format to MatCUSPARSESetFormat(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only option is offered for unfactored matrices only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then point the
   numeric phase at the CUSPARSE version so the factors get mirrored to the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: delegate to the CPU SeqAIJ implementation and install the
   CUSPARSE numeric-factorization routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: delegate to the CPU SeqAIJ implementation and install the
   CUSPARSE numeric-factorization routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation and install the
   CUSPARSE numeric-factorization routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit-diagonal lower-triangular factor L from the combined CPU ILU
   factor stored in A (a Mat_SeqAIJ holding both L and U), assemble it into
   pinned host CSR arrays, upload it to the GPU (32-bit indices), and run the
   CUSPARSE triangular-solve analysis.  Skipped when the GPU copy is current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix ==
PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for the staging copy) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;   /* row 0 of L is just the unit diagonal */
      v = aa;
      vi = aj;
      offset = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i] = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset += 1;

        v += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp =
CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the host-assembled CSR arrays into device (thrust) storage */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned host staging buffers are no longer needed once thrust has copied them */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the upper-triangular factor U (non-unit diagonal, stored as its
   reciprocal) from the combined CPU ILU factor in A, upload it to the GPU as
   CSR, and run the CUSPARSE triangular-solve analysis.  Mirrors
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];   /* positive: adiag[i] > adiag[i+1] in this layout (see nz below) */

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: the diagonal is stored as its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* copy the host-assembled CSR arrays into device (thrust) storage */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU and record the row/column
   permutations (only when they are not the identity) for the solve phase. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* allocate an n-length work vector on the GPU */
  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: only store the permutation when it is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation arrays that were actually transferred */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC/Cholesky triangular factors on the GPU from the CPU factor.
   The upper factor U is uploaded as CSR; the "lower" factor reuses the same
   sparsity pattern with rescaled values and is solved with the transpose
   operation (see the descriptor setup further below). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): reinterprets the same A->data as SBAIJ; presumably relies on
     layout-compatible leading i/j/a fields -- confirm against the struct definitions */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if
(A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers;
         AALo holds the rescaled values for the transposed "lower" solve) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (stored as reciprocals in both value arrays) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i] = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: AAUp gets the negated entries; AALo additionally divides by the diagonal */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat =
cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the host-assembled CSR arrays into device (thrust) storage */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description.  The "lower" factor reuses the UPPER-stored
         sparsity pattern and is applied with CUSPARSE_OPERATION_TRANSPOSE below,
         hence CUSPARSE_FILL_MODE_UPPER here is intentional. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, but the AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC triangular factors on the GPU and record the row permutation
   and its inverse (only when the permutation is not the identity). */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both factors share the upper-triangular pattern, so count the off-diagonals twice */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* symmetric permutation: rows use ip, columns use its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric LU is delegated to the CPU SeqAIJ routine; the GPU only gets the
     finished factors mirrored below */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used: the natural-ordering
     variants are selected only when both orderings are the identity. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC: factor on the CPU via SeqAIJ, pick the MatSolve
   variants according to the ordering, then mirror the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 709 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 710 if (perm_identity) { 711 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 712 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 713 B->ops->matsolve = NULL; 714 B->ops->matsolvetranspose = NULL; 715 } else { 716 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 717 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 718 B->ops->matsolve = NULL; 719 B->ops->matsolvetranspose = NULL; 720 } 721 722 /* get the triangular factors */ 723 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } 726 727 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 728 { 729 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 730 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 731 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 732 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 733 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 734 cusparseStatus_t stat; 735 cusparseIndexBase_t indexBase; 736 cusparseMatrixType_t matrixType; 737 cusparseFillMode_t fillMode; 738 cusparseDiagType_t diagType; 739 740 PetscFunctionBegin; 741 742 /*********************************************/ 743 /* Now the Transpose of the Lower Tri Factor */ 744 /*********************************************/ 745 746 /* allocate space for the transpose of the lower triangular factor */ 747 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 748 749 /* set the matrix descriptors of the lower triangular factor */ 750 matrixType = cusparseGetMatType(loTriFactor->descr); 751 indexBase = 
cusparseGetMatIndexBase(loTriFactor->descr); 752 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 753 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 754 diagType = cusparseGetMatDiagType(loTriFactor->descr); 755 756 /* Create the matrix description */ 757 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 758 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 759 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 760 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 761 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 762 763 /* Create the solve analysis information */ 764 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 765 766 /* set the operation */ 767 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 768 769 /* allocate GPU space for the CSC of the lower triangular factor*/ 770 loTriFactorT->csrMat = new CsrMatrix; 771 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 772 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 773 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 774 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 775 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 776 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 777 778 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 779 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 780 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 781 loTriFactor->csrMat->values->data().get(), 782 loTriFactor->csrMat->row_offsets->data().get(), 783 loTriFactor->csrMat->column_indices->data().get(), 784 loTriFactorT->csrMat->values->data().get(), 785 loTriFactorT->csrMat->column_indices->data().get(), 786 loTriFactorT->csrMat->row_offsets->data().get(), 787 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 788 789 /* perform the solve analysis on the transposed matrix */ 790 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 791 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 792 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 793 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 794 loTriFactorT->solveInfo);CHKERRCUDA(stat); 795 796 /* assign the pointer. Is this really necessary? */ 797 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 798 799 /*********************************************/ 800 /* Now the Transpose of the Upper Tri Factor */ 801 /*********************************************/ 802 803 /* allocate space for the transpose of the upper triangular factor */ 804 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 805 806 /* set the matrix descriptors of the upper triangular factor */ 807 matrixType = cusparseGetMatType(upTriFactor->descr); 808 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 809 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
810 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 811 diagType = cusparseGetMatDiagType(upTriFactor->descr); 812 813 /* Create the matrix description */ 814 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 815 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 816 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 817 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 818 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 819 820 /* Create the solve analysis information */ 821 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 822 823 /* set the operation */ 824 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 825 826 /* allocate GPU space for the CSC of the upper triangular factor*/ 827 upTriFactorT->csrMat = new CsrMatrix; 828 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 829 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 830 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 831 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 832 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 833 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 834 835 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 836 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 837 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 838 upTriFactor->csrMat->values->data().get(), 839 upTriFactor->csrMat->row_offsets->data().get(), 840 upTriFactor->csrMat->column_indices->data().get(), 841 upTriFactorT->csrMat->values->data().get(), 842 upTriFactorT->csrMat->column_indices->data().get(), 843 upTriFactorT->csrMat->row_offsets->data().get(), 844 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 845 846 /* perform the solve analysis on the transposed matrix */ 847 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 848 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 849 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 850 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 851 upTriFactorT->solveInfo);CHKERRCUDA(stat); 852 853 /* assign the pointer. Is this really necessary? 
*/ 854 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 855 PetscFunctionReturn(0); 856 } 857 858 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 859 { 860 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 861 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 862 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 863 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 864 cusparseStatus_t stat; 865 cusparseIndexBase_t indexBase; 866 cudaError_t err; 867 PetscErrorCode ierr; 868 869 PetscFunctionBegin; 870 871 /* allocate space for the triangular factor information */ 872 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 873 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 874 indexBase = cusparseGetMatIndexBase(matstruct->descr); 875 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 876 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 877 878 /* set alpha and beta */ 879 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 880 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 881 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 882 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 883 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 884 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 885 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 886 887 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 888 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 889 
    /* CSR path: transpose by converting CSR(A) to CSC, stored as CSR(A^T) */
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU):
       rebuild full row offsets on the host; empty rows repeat the previous offset */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    /* every compressed row must have been consumed, otherwise the mapping is inconsistent */
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU
    /* NOTE(review): rowoffsets_gpu is allocated here and never deleted in this
       routine -- looks like a device-memory leak; confirm ownership/lifetime */

    /* compute the transpose of the matrix, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* HYB/ELL path: no direct transpose exists, so go HYB -> CSR -> CSC -> HYB */
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so the "compressed" row list is simply 0..ncols-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solve A^T x = b using the cached transposed
  triangular factors, applying the row/column permutations of the factorization.
  The transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U (for A^T the factors are applied in reverse order: U^T then L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - as above, but with identity
  permutations, so the permutation gather/scatter steps are skipped entirely.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolve_SeqAIJCUSPARSE - solve A x = b on the GPU with the permuted factors:
  gather b with the row permutation, solve L then U, scatter back with the
  column permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, reorder with the column permutation */
thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1172 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1173 xGPU); 1174 1175 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1176 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1177 ierr = WaitForGPU();CHKERRCUDA(ierr); 1178 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1179 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1180 PetscFunctionReturn(0); 1181 } 1182 1183 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1184 { 1185 const PetscScalar *barray; 1186 PetscScalar *xarray; 1187 cusparseStatus_t stat; 1188 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1189 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1190 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1191 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1192 PetscErrorCode ierr; 1193 1194 PetscFunctionBegin; 1195 /* Get the GPU pointers */ 1196 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1197 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1198 1199 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1200 /* First, solve L */ 1201 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1202 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1203 loTriFactor->csrMat->values->data().get(), 1204 loTriFactor->csrMat->row_offsets->data().get(), 1205 loTriFactor->csrMat->column_indices->data().get(), 1206 loTriFactor->solveInfo, 1207 barray, tempGPU->data().get());CHKERRCUDA(stat); 1208 1209 /* Next, solve U */ 1210 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1211 upTriFactor->csrMat->num_rows, 
&PETSC_CUSPARSE_ONE, upTriFactor->descr, 1212 upTriFactor->csrMat->values->data().get(), 1213 upTriFactor->csrMat->row_offsets->data().get(), 1214 upTriFactor->csrMat->column_indices->data().get(), 1215 upTriFactor->solveInfo, 1216 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1217 1218 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1219 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1220 ierr = WaitForGPU();CHKERRCUDA(ierr); 1221 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1222 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1223 PetscFunctionReturn(0); 1224 } 1225 1226 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1227 { 1228 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1229 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1231 PetscInt m = A->rmap->n,*ii,*ridx; 1232 PetscErrorCode ierr; 1233 cusparseStatus_t stat; 1234 cudaError_t err; 1235 1236 PetscFunctionBegin; 1237 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 1238 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1239 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1240 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1241 /* copy values only */ 1242 matrix->values->assign(a->a, a->a+a->nz); 1243 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1244 } else { 1245 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1246 try { 1247 cusparsestruct->nonzerorow=0; 1248 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1249 1250 if (a->compressedrow.use) { 1251 m = a->compressedrow.nrows; 1252 ii = a->compressedrow.i; 1253 ridx = a->compressedrow.rindex; 1254 } else { 1255 /* Forcing compressed row on the GPU */ 1256 int k=0; 
          /* build compressed row offsets (ii) and the list of nonempty rows (ridx) */
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants for the SpMV calls (device pointer mode) */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* upload as CSR first, convert to HYB, then discard the CSR temporary */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          /* ii/ridx were allocated above only when compressed row was forced */
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* functor for thrust::for_each: accumulate the first tuple element into the second (z += w) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/*
  MatMult_SeqAIJCUSPARSE - y = A x, implemented as MatMultAdd with no addend.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatMultTranspose_SeqAIJCUSPARSE - y = A^T x, performed as a non-transposed
  SpMV on the cached transpose (built lazily on first use).
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr =
PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  MatMultAdd_SeqAIJCUSPARSE - z = y + A x (or z = A x when yy is NULL).

  When the GPU matrix uses compressed-row storage (empty rows removed), the SpMV
  result lives in a short work vector and is scattered back into z with a +=
  gather/scatter; otherwise the SpMV writes into z directly.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* "compressed" here means no rows were dropped: workVector spans all rows */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* destination of the SpMV: z itself, or the work vector when rows are compressed */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place only when z doubles as the addend */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* SpMV went to the work vector: start from z = y, then scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatMultTransposeAdd_SeqAIJCUSPARSE - z = y + A^T x via the cached transpose.
  (Definition continues beyond this chunk.)
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT =
(Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1533 } 1534 1535 try { 1536 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1537 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1538 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1539 dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get(); 1540 beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero; 1541 1542 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1543 /* multiply add with matrix transpose */ 1544 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1545 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1546 /* here we need to be careful to set the number of rows in the multiply to the 1547 number of compressed rows in the matrix ... which is equivalent to the 1548 size of the workVector */ 1549 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1550 mat->num_rows, mat->num_cols, 1551 mat->num_entries, matstructT->alpha, matstructT->descr, 1552 mat->values->data().get(), mat->row_offsets->data().get(), 1553 mat->column_indices->data().get(), xarray, beta, 1554 dptr);CHKERRCUDA(stat); 1555 } else { 1556 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1557 if (cusparsestruct->workVector->size()) { 1558 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1559 matstructT->alpha, matstructT->descr, hybMat, 1560 xarray, beta, 1561 dptr);CHKERRCUDA(stat); 1562 } 1563 } 1564 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1565 1566 if (dptr != zarray) { 1567 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1568 } else if (zz != yy) { 1569 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1570 } 1571 /* scatter the data from the temporary into the full vector with a += operation */ 1572 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1573 if (dptr != zarray) { 1574 
thrust::device_ptr<PetscScalar> zptr; 1575 1576 zptr = thrust::device_pointer_cast(zarray); 1577 1578 /* scatter the data from the temporary into the full vector with a += operation */ 1579 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1580 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1581 VecCUDAPlusEquals()); 1582 } 1583 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1584 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1585 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1586 } catch(char *ex) { 1587 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1588 } 1589 ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */ 1590 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1591 PetscFunctionReturn(0); 1592 } 1593 1594 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1595 { 1596 PetscErrorCode ierr; 1597 1598 PetscFunctionBegin; 1599 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1600 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1601 if (A->factortype == MAT_FACTOR_NONE) { 1602 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1603 } 1604 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1605 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1606 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1607 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1608 PetscFunctionReturn(0); 1609 } 1610 1611 /* --------------------------------------------------------------------------------*/ 1612 /*@ 1613 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1614 (the default parallel PETSc format). 
   This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
1654 1655 Level: intermediate 1656 1657 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1658 @*/ 1659 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1660 { 1661 PetscErrorCode ierr; 1662 1663 PetscFunctionBegin; 1664 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1665 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1666 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1667 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1668 PetscFunctionReturn(0); 1669 } 1670 1671 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1672 { 1673 PetscErrorCode ierr; 1674 1675 PetscFunctionBegin; 1676 if (A->factortype==MAT_FACTOR_NONE) { 1677 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1678 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1679 } 1680 } else { 1681 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1682 } 1683 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1684 PetscFunctionReturn(0); 1685 } 1686 1687 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1688 { 1689 PetscErrorCode ierr; 1690 Mat C; 1691 cusparseStatus_t stat; 1692 cusparseHandle_t handle=0; 1693 1694 PetscFunctionBegin; 1695 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1696 C = *B; 1697 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1698 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1699 1700 /* inject CUSPARSE-specific stuff */ 1701 if (C->factortype==MAT_FACTOR_NONE) { 1702 /* you cannot check the inode.use flag here since the matrix was just created. 
1703 now build a GPU matrix data structure */ 1704 C->spptr = new Mat_SeqAIJCUSPARSE; 1705 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1706 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1707 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1708 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1709 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1710 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1711 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1712 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1713 } else { 1714 /* NEXT, set the pointers to the triangular factors */ 1715 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1716 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1717 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1718 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1719 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1720 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1721 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1723 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1724 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1725 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1726 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1727 } 1728 1729 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1730 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1731 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1732 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1733 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1734 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1735 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1736 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1737 1738 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1739 1740 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1741 1742 ierr = 
PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1747 { 1748 PetscErrorCode ierr; 1749 cusparseStatus_t stat; 1750 cusparseHandle_t handle=0; 1751 1752 PetscFunctionBegin; 1753 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1754 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1755 1756 if (B->factortype==MAT_FACTOR_NONE) { 1757 /* you cannot check the inode.use flag here since the matrix was just created. 1758 now build a GPU matrix data structure */ 1759 B->spptr = new Mat_SeqAIJCUSPARSE; 1760 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1761 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1762 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1763 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1764 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1765 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1766 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1767 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1768 } else { 1769 /* NEXT, set the pointers to the triangular factors */ 1770 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1771 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1772 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1773 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1774 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1775 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1776 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1777 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1778 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1779 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1780 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1781 } 1782 1783 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 
1784 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1785 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1786 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1787 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1788 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1789 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1790 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1791 1792 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1793 1794 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1795 1796 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1797 PetscFunctionReturn(0); 1798 } 1799 1800 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1801 { 1802 PetscErrorCode ierr; 1803 1804 PetscFunctionBegin; 1805 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1806 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1807 PetscFunctionReturn(0); 1808 } 1809 1810 /*MC 1811 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1812 1813 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1814 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1815 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1816 1817 Options Database Keys: 1818 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1819 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1820 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1821 1822 Level: beginner 1823 1824 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1825 M*/ 1826 1827 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1828 1829 1830 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1831 { 1832 PetscErrorCode ierr; 1833 1834 PetscFunctionBegin; 1835 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1836 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1837 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1838 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1839 PetscFunctionReturn(0); 1840 } 1841 1842 1843 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1844 { 1845 cusparseStatus_t stat; 1846 cusparseHandle_t handle; 1847 1848 PetscFunctionBegin; 1849 if (*cusparsestruct) { 1850 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1851 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1852 delete (*cusparsestruct)->workVector; 1853 if (handle = (*cusparsestruct)->handle) { 1854 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1855 } 1856 delete *cusparsestruct; 1857 *cusparsestruct = 0; 1858 } 1859 PetscFunctionReturn(0); 1860 } 1861 1862 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1863 { 1864 PetscFunctionBegin; 1865 if (*mat) { 1866 delete (*mat)->values; 1867 delete (*mat)->column_indices; 1868 delete (*mat)->row_offsets; 1869 delete *mat; 1870 *mat = 0; 1871 } 1872 PetscFunctionReturn(0); 1873 } 1874 1875 
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1876 { 1877 cusparseStatus_t stat; 1878 PetscErrorCode ierr; 1879 1880 PetscFunctionBegin; 1881 if (*trifactor) { 1882 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1883 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1884 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1885 delete *trifactor; 1886 *trifactor = 0; 1887 } 1888 PetscFunctionReturn(0); 1889 } 1890 1891 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1892 { 1893 CsrMatrix *mat; 1894 cusparseStatus_t stat; 1895 cudaError_t err; 1896 1897 PetscFunctionBegin; 1898 if (*matstruct) { 1899 if ((*matstruct)->mat) { 1900 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1901 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1902 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1903 } else { 1904 mat = (CsrMatrix*)(*matstruct)->mat; 1905 CsrMatrix_Destroy(&mat); 1906 } 1907 } 1908 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1909 delete (*matstruct)->cprowIndices; 1910 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1911 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 1912 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 1913 delete *matstruct; 1914 *matstruct = 0; 1915 } 1916 PetscFunctionReturn(0); 1917 } 1918 1919 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1920 { 1921 cusparseHandle_t handle; 1922 cusparseStatus_t stat; 1923 1924 PetscFunctionBegin; 1925 if (*trifactors) { 1926 
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1927 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1928 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1929 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1930 delete (*trifactors)->rpermIndices; 1931 delete (*trifactors)->cpermIndices; 1932 delete (*trifactors)->workVector; 1933 if (handle = (*trifactors)->handle) { 1934 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1935 } 1936 delete *trifactors; 1937 *trifactors = 0; 1938 } 1939 PetscFunctionReturn(0); 1940 } 1941 1942