/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* NOTE(review): the next two declarations share a name but differ in parameter types —
   presumably intentional C++ overloading for the two destroy variants; confirm against
   the definitions further down the file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Record a caller-provided CUDA stream in the matrix' GPU data and attach it to the
   matrix' cuSPARSE handle, so subsequent cuSPARSE calls run on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix' cuSPARSE handle (destroying any previously owned handle first)
   and select device pointer mode for scalar arguments on that handle. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the matrix' reference to its cuSPARSE handle; used when the
   handle is owned elsewhere. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factor creation for the "cusparse" solver: creates B with A's sizes as a
   SEQAIJCUSPARSE matrix and wires up the symbolic-factorization entry points
   appropriate for the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_mult_storage_format (unfactored matrices only) and
   -mat_cusparse_storage_format options and forwards them to MatCUSPARSESetFormat(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: delegates to the host AIJ implementation, then installs the
   CUSPARSE numeric-factorization entry point on B. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: delegates to the host AIJ implementation, then installs the
   CUSPARSE numeric-factorization entry point on B. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: delegates to the host AIJ implementation, then installs the
   CUSPARSE Cholesky numeric entry point on B. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: delegates to the host AIJ implementation, then installs the
   CUSPARSE Cholesky numeric entry point on B. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Builds, from the host ILU factor stored in A, the lower triangular factor L
   (unit diagonal, per CUSPARSE_DIAG_TYPE_UNIT below) on the GPU and runs the
   cuSPARSE triangular-solve analysis on it. No-op when the GPU copy is current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix ==
PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds, from the host ILU factor stored in A, the upper triangular factor U
   (diagonal stored inverted, see AAUp[offset] below) on the GPU and runs the
   cuSPARSE triangular-solve analysis on it. No-op when the GPU copy is current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU, allocates the solve work vector,
   and uploads the row/column permutations when they are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device transfer of whichever permutations were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds, from the host ICC factor stored in A, the two triangular factors on the
   GPU. Only upper-triangular storage is filled; the lower solve reuses it with a
   transpose operation (set up further below). No-op when the GPU copy is current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat =
cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      /* fill mode is deliberately UPPER: the lower solve is done as a transpose solve
         on the upper-triangular storage (solveOp is CUSPARSE_OPERATION_TRANSPOSE below) */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the ICC triangular factors on the GPU, allocates the solve work vector,
   and uploads the permutation and its inverse when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric factorization happens on the host; the GPU copy is rebuilt below */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC: host factorization followed by rebuilding the GPU
   triangular factors and selecting the natural-ordering solve when possible. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds CSC (i.e. transposed CSR) copies of both triangular factors and runs the
   cuSPARSE solve analysis on them, for use by the MatSolveTranspose paths. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because the transpose of a lower factor is upper (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
809 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 810 diagType = cusparseGetMatDiagType(upTriFactor->descr); 811 812 /* Create the matrix description */ 813 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 814 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 815 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 816 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 817 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 818 819 /* Create the solve analysis information */ 820 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 821 822 /* set the operation */ 823 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 824 825 /* allocate GPU space for the CSC of the upper triangular factor*/ 826 upTriFactorT->csrMat = new CsrMatrix; 827 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 828 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 829 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 830 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 831 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 832 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 833 834 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 835 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 836 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 837 upTriFactor->csrMat->values->data().get(), 838 upTriFactor->csrMat->row_offsets->data().get(), 839 upTriFactor->csrMat->column_indices->data().get(), 840 upTriFactorT->csrMat->values->data().get(), 841 upTriFactorT->csrMat->column_indices->data().get(), 842 upTriFactorT->csrMat->row_offsets->data().get(), 843 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 844 845 /* perform the solve analysis on the transposed matrix */ 846 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 847 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 848 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 849 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 850 upTriFactorT->solveInfo);CHKERRCUDA(stat); 851 852 /* assign the pointer. Is this really necessary? 
*/ 853 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 854 PetscFunctionReturn(0); 855 } 856 857 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 858 { 859 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 860 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 861 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 862 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 863 cusparseStatus_t stat; 864 cusparseIndexBase_t indexBase; 865 cudaError_t err; 866 PetscErrorCode ierr; 867 868 PetscFunctionBegin; 869 870 /* allocate space for the triangular factor information */ 871 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 872 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 873 indexBase = cusparseGetMatIndexBase(matstruct->descr); 874 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 875 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 876 877 /* set alpha and beta */ 878 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 879 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 880 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 881 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 882 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 883 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 884 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 885 886 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 887 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 888 
CsrMatrix *matrixT= new CsrMatrix; 889 thrust::device_vector<int> *rowoffsets_gpu; 890 const int ncomprow = matstruct->cprowIndices->size(); 891 thrust::host_vector<int> cprow(ncomprow); 892 cprow = *matstruct->cprowIndices; // GPU --> CPU 893 matrixT->num_rows = A->cmap->n; 894 matrixT->num_cols = A->rmap->n; 895 matrixT->num_entries = a->nz; 896 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 897 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 898 matrixT->values = new THRUSTARRAY(a->nz); 899 PetscInt k,i; 900 thrust::host_vector<int> rowst_host(A->rmap->n+1); 901 902 /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */ 903 rowst_host[0] = 0; 904 for (k = 0, i = 0; i < A->rmap->n ; i++) { 905 if (k < ncomprow && i==cprow[k]) { 906 rowst_host[i+1] = a->i[i+1]; 907 k++; 908 } else rowst_host[i+1] = rowst_host[i]; 909 } 910 if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow); 911 rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 912 *rowoffsets_gpu = rowst_host; // CPU --> GPU 913 914 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 915 indexBase = cusparseGetMatIndexBase(matstruct->descr); 916 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 917 A->cmap->n, matrix->num_entries, 918 matrix->values->data().get(), 919 rowoffsets_gpu->data().get(), 920 matrix->column_indices->data().get(), 921 matrixT->values->data().get(), 922 matrixT->column_indices->data().get(), 923 matrixT->row_offsets->data().get(), 924 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 925 926 /* assign the pointer */ 927 matstructT->mat = matrixT; 928 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 929 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 930 /* First convert HYB to CSR */ 931 CsrMatrix *temp= new CsrMatrix; 932 temp->num_rows = A->rmap->n; 933 temp->num_cols = A->cmap->n; 934 temp->num_entries = a->nz; 935 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 936 temp->column_indices = new THRUSTINTARRAY32(a->nz); 937 temp->values = new THRUSTARRAY(a->nz); 938 939 940 stat = cusparse_hyb2csr(cusparsestruct->handle, 941 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 942 temp->values->data().get(), 943 temp->row_offsets->data().get(), 944 temp->column_indices->data().get());CHKERRCUDA(stat); 945 946 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 947 CsrMatrix *tempT= new CsrMatrix; 948 tempT->num_rows = A->rmap->n; 949 tempT->num_cols = A->cmap->n; 950 tempT->num_entries = a->nz; 951 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 952 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 953 tempT->values = new THRUSTARRAY(a->nz); 954 955 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 956 temp->num_cols, temp->num_entries, 957 temp->values->data().get(), 958 temp->row_offsets->data().get(), 959 temp->column_indices->data().get(), 960 tempT->values->data().get(), 961 tempT->column_indices->data().get(), 962 tempT->row_offsets->data().get(), 963 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 964 965 /* Last, convert CSC to HYB */ 966 cusparseHybMat_t hybMat; 967 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 968 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 969 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 970 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 971 matstructT->descr, tempT->values->data().get(), 972 tempT->row_offsets->data().get(), 973 tempT->column_indices->data().get(), 974 hybMat, 0, partition);CHKERRCUDA(stat); 975 976 /* assign the pointer */ 977 matstructT->mat = hybMat; 978 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 979 980 /* delete temporaries */ 981 if (tempT) { 982 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 983 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 984 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 985 delete (CsrMatrix*) tempT; 986 } 987 if (temp) { 988 if (temp->values) delete (THRUSTARRAY*) temp->values; 989 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 990 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 991 delete (CsrMatrix*) temp; 992 
} 993 } 994 /* assign the compressed row indices */ 995 matstructT->cprowIndices = new THRUSTINTARRAY; 996 matstructT->cprowIndices->resize(A->cmap->n); 997 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 998 /* assign the pointer */ 999 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1000 PetscFunctionReturn(0); 1001 } 1002 1003 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1004 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1005 { 1006 PetscInt n = xx->map->n; 1007 const PetscScalar *barray; 1008 PetscScalar *xarray; 1009 thrust::device_ptr<const PetscScalar> bGPU; 1010 thrust::device_ptr<PetscScalar> xGPU; 1011 cusparseStatus_t stat; 1012 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1013 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1014 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1015 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1016 PetscErrorCode ierr; 1017 1018 PetscFunctionBegin; 1019 /* Analyze the matrix and create the transpose ... 
on the fly */ 1020 if (!loTriFactorT && !upTriFactorT) { 1021 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1022 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1023 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1024 } 1025 1026 /* Get the GPU pointers */ 1027 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1028 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1029 xGPU = thrust::device_pointer_cast(xarray); 1030 bGPU = thrust::device_pointer_cast(barray); 1031 1032 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1033 /* First, reorder with the row permutation */ 1034 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1035 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1036 xGPU); 1037 1038 /* First, solve U */ 1039 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1040 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1041 upTriFactorT->csrMat->values->data().get(), 1042 upTriFactorT->csrMat->row_offsets->data().get(), 1043 upTriFactorT->csrMat->column_indices->data().get(), 1044 upTriFactorT->solveInfo, 1045 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1046 1047 /* Then, solve L */ 1048 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1049 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1050 loTriFactorT->csrMat->values->data().get(), 1051 loTriFactorT->csrMat->row_offsets->data().get(), 1052 loTriFactorT->csrMat->column_indices->data().get(), 1053 loTriFactorT->solveInfo, 1054 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1055 1056 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1057 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1058 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1059 tempGPU->begin()); 1060 1061 /* Copy the temporary to the full solution. */ 1062 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1063 1064 /* restore */ 1065 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1066 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1067 ierr = WaitForGPU();CHKERRCUDA(ierr); 1068 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1069 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1070 PetscFunctionReturn(0); 1071 } 1072 1073 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1074 { 1075 const PetscScalar *barray; 1076 PetscScalar *xarray; 1077 cusparseStatus_t stat; 1078 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1079 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1080 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1081 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1082 PetscErrorCode ierr; 1083 1084 PetscFunctionBegin; 1085 /* Analyze the matrix and create the transpose ... 
on the fly */ 1086 if (!loTriFactorT && !upTriFactorT) { 1087 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1088 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1089 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1090 } 1091 1092 /* Get the GPU pointers */ 1093 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1094 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1095 1096 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1097 /* First, solve U */ 1098 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1099 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1100 upTriFactorT->csrMat->values->data().get(), 1101 upTriFactorT->csrMat->row_offsets->data().get(), 1102 upTriFactorT->csrMat->column_indices->data().get(), 1103 upTriFactorT->solveInfo, 1104 barray, tempGPU->data().get());CHKERRCUDA(stat); 1105 1106 /* Then, solve L */ 1107 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1108 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1109 loTriFactorT->csrMat->values->data().get(), 1110 loTriFactorT->csrMat->row_offsets->data().get(), 1111 loTriFactorT->csrMat->column_indices->data().get(), 1112 loTriFactorT->solveInfo, 1113 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1114 1115 /* restore */ 1116 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1117 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1118 ierr = WaitForGPU();CHKERRCUDA(ierr); 1119 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1120 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1121 PetscFunctionReturn(0); 1122 } 1123 1124 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1125 { 1126 const PetscScalar *barray; 1127 PetscScalar *xarray; 1128 thrust::device_ptr<const PetscScalar> bGPU; 1129 thrust::device_ptr<PetscScalar> 
xGPU; 1130 cusparseStatus_t stat; 1131 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1132 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1133 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1134 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1135 PetscErrorCode ierr; 1136 1137 PetscFunctionBegin; 1138 1139 /* Get the GPU pointers */ 1140 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1141 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1142 xGPU = thrust::device_pointer_cast(xarray); 1143 bGPU = thrust::device_pointer_cast(barray); 1144 1145 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1146 /* First, reorder with the row permutation */ 1147 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1148 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1149 tempGPU->begin()); 1150 1151 /* Next, solve L */ 1152 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1153 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1154 loTriFactor->csrMat->values->data().get(), 1155 loTriFactor->csrMat->row_offsets->data().get(), 1156 loTriFactor->csrMat->column_indices->data().get(), 1157 loTriFactor->solveInfo, 1158 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1159 1160 /* Then, solve U */ 1161 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1162 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1163 upTriFactor->csrMat->values->data().get(), 1164 upTriFactor->csrMat->row_offsets->data().get(), 1165 upTriFactor->csrMat->column_indices->data().get(), 1166 upTriFactor->solveInfo, 1167 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1168 1169 /* Last, reorder with the column permutation */ 1170 
thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1171 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1172 xGPU); 1173 1174 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1175 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1176 ierr = WaitForGPU();CHKERRCUDA(ierr); 1177 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1178 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1179 PetscFunctionReturn(0); 1180 } 1181 1182 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1183 { 1184 const PetscScalar *barray; 1185 PetscScalar *xarray; 1186 cusparseStatus_t stat; 1187 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1188 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1189 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1190 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1191 PetscErrorCode ierr; 1192 1193 PetscFunctionBegin; 1194 /* Get the GPU pointers */ 1195 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1196 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1197 1198 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1199 /* First, solve L */ 1200 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1201 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1202 loTriFactor->csrMat->values->data().get(), 1203 loTriFactor->csrMat->row_offsets->data().get(), 1204 loTriFactor->csrMat->column_indices->data().get(), 1205 loTriFactor->solveInfo, 1206 barray, tempGPU->data().get());CHKERRCUDA(stat); 1207 1208 /* Next, solve U */ 1209 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1210 upTriFactor->csrMat->num_rows, 
&PETSC_CUSPARSE_ONE, upTriFactor->descr, 1211 upTriFactor->csrMat->values->data().get(), 1212 upTriFactor->csrMat->row_offsets->data().get(), 1213 upTriFactor->csrMat->column_indices->data().get(), 1214 upTriFactor->solveInfo, 1215 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1216 1217 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1218 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1219 ierr = WaitForGPU();CHKERRCUDA(ierr); 1220 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1221 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1222 PetscFunctionReturn(0); 1223 } 1224 1225 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1226 { 1227 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1228 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1229 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1230 PetscInt m = A->rmap->n,*ii,*ridx; 1231 PetscErrorCode ierr; 1232 cusparseStatus_t stat; 1233 cudaError_t err; 1234 1235 PetscFunctionBegin; 1236 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 1237 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1238 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1239 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1240 /* copy values only */ 1241 matrix->values->assign(a->a, a->a+a->nz); 1242 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1243 } else { 1244 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1245 try { 1246 cusparsestruct->nonzerorow=0; 1247 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1248 1249 if (a->compressedrow.use) { 1250 m = a->compressedrow.nrows; 1251 ii = a->compressedrow.i; 1252 ridx = a->compressedrow.rindex; 1253 } else { 1254 /* Forcing compressed row on the GPU */ 1255 int k=0; 
1256 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1257 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1258 ii[0]=0; 1259 for (int j = 0; j<m; j++) { 1260 if ((a->i[j+1]-a->i[j])>0) { 1261 ii[k] = a->i[j]; 1262 ridx[k]= j; 1263 k++; 1264 } 1265 } 1266 ii[cusparsestruct->nonzerorow] = a->nz; 1267 m = cusparsestruct->nonzerorow; 1268 } 1269 1270 /* allocate space for the triangular factor information */ 1271 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1272 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1273 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1274 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1275 1276 err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1277 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1278 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1279 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1280 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1281 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1282 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1283 1284 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1285 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1286 /* set the matrix */ 1287 CsrMatrix *matrix= new CsrMatrix; 1288 matrix->num_rows = m; 1289 matrix->num_cols = A->cmap->n; 1290 matrix->num_entries = a->nz; 1291 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1292 matrix->row_offsets->assign(ii, ii + m+1); 1293 1294 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 
1295 matrix->column_indices->assign(a->j, a->j+a->nz); 1296 1297 matrix->values = new THRUSTARRAY(a->nz); 1298 matrix->values->assign(a->a, a->a+a->nz); 1299 1300 /* assign the pointer */ 1301 matstruct->mat = matrix; 1302 1303 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1304 CsrMatrix *matrix= new CsrMatrix; 1305 matrix->num_rows = m; 1306 matrix->num_cols = A->cmap->n; 1307 matrix->num_entries = a->nz; 1308 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1309 matrix->row_offsets->assign(ii, ii + m+1); 1310 1311 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1312 matrix->column_indices->assign(a->j, a->j+a->nz); 1313 1314 matrix->values = new THRUSTARRAY(a->nz); 1315 matrix->values->assign(a->a, a->a+a->nz); 1316 1317 cusparseHybMat_t hybMat; 1318 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1319 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1320 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1321 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1322 matstruct->descr, matrix->values->data().get(), 1323 matrix->row_offsets->data().get(), 1324 matrix->column_indices->data().get(), 1325 hybMat, 0, partition);CHKERRCUDA(stat); 1326 /* assign the pointer */ 1327 matstruct->mat = hybMat; 1328 1329 if (matrix) { 1330 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1331 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1332 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1333 delete (CsrMatrix*)matrix; 1334 } 1335 } 1336 1337 /* assign the compressed row indices */ 1338 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1339 matstruct->cprowIndices->assign(ridx,ridx+m); 1340 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1341 1342 /* assign the pointer */ 1343 cusparsestruct->mat = matstruct; 
1344 1345 if (!a->compressedrow.use) { 1346 ierr = PetscFree(ii);CHKERRQ(ierr); 1347 ierr = PetscFree(ridx);CHKERRQ(ierr); 1348 } 1349 cusparsestruct->workVector = new THRUSTARRAY(m); 1350 } catch(char *ex) { 1351 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1352 } 1353 cusparsestruct->nonzerostate = A->nonzerostate; 1354 } 1355 ierr = WaitForGPU();CHKERRCUDA(ierr); 1356 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 1357 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1358 } 1359 PetscFunctionReturn(0); 1360 } 1361 1362 struct VecCUDAPlusEquals 1363 { 1364 template <typename Tuple> 1365 __host__ __device__ 1366 void operator()(Tuple t) 1367 { 1368 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1369 } 1370 }; 1371 1372 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1373 { 1374 PetscErrorCode ierr; 1375 1376 PetscFunctionBegin; 1377 ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 1378 PetscFunctionReturn(0); 1379 } 1380 1381 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1382 { 1383 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1384 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1385 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1386 const PetscScalar *xarray; 1387 PetscScalar *yarray; 1388 PetscErrorCode ierr; 1389 cusparseStatus_t stat; 1390 1391 PetscFunctionBegin; 1392 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1393 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1394 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1395 if (!matstructT) { 1396 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1397 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1398 } 1399 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1400 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1401 1402 ierr = 
PetscLogGpuTimeBegin();CHKERRQ(ierr); 1403 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1404 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1405 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1406 mat->num_rows, mat->num_cols, 1407 mat->num_entries, matstructT->alpha, matstructT->descr, 1408 mat->values->data().get(), mat->row_offsets->data().get(), 1409 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1410 yarray);CHKERRCUDA(stat); 1411 } else { 1412 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1413 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1414 matstructT->alpha, matstructT->descr, hybMat, 1415 xarray, matstructT->beta_zero, 1416 yarray);CHKERRCUDA(stat); 1417 } 1418 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1419 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1420 ierr = WaitForGPU();CHKERRCUDA(ierr); 1421 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1422 ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1423 PetscFunctionReturn(0); 1424 } 1425 1426 1427 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1428 { 1429 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1430 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1431 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1432 const PetscScalar *xarray; 1433 PetscScalar *zarray,*dptr,*beta; 1434 PetscErrorCode ierr; 1435 cusparseStatus_t stat; 1436 PetscBool cmpr; /* if the matrix has been compressed (zero rows) */ 1437 1438 PetscFunctionBegin; 1439 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1440 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1441 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1442 try { 1443 cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, 
thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n)); 1444 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1445 if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */ 1446 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1447 } else { 1448 ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1449 } 1450 dptr = cmpr ? zarray : cusparsestruct->workVector->data().get(); 1451 beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero; 1452 1453 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1454 /* csr_spmv is multiply add */ 1455 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1456 /* here we need to be careful to set the number of rows in the multiply to the 1457 number of compressed rows in the matrix ... which is equivalent to the 1458 size of the workVector */ 1459 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1460 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1461 mat->num_rows, mat->num_cols, 1462 mat->num_entries, matstruct->alpha, matstruct->descr, 1463 mat->values->data().get(), mat->row_offsets->data().get(), 1464 mat->column_indices->data().get(), xarray, beta, 1465 dptr);CHKERRCUDA(stat); 1466 } else { 1467 if (cusparsestruct->workVector->size()) { 1468 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1469 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1470 matstruct->alpha, matstruct->descr, hybMat, 1471 xarray, beta, 1472 dptr);CHKERRCUDA(stat); 1473 } 1474 } 1475 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1476 1477 if (yy) { 1478 if (dptr != zarray) { 1479 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1480 } else if (zz != yy) { 1481 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1482 } 1483 } else if (dptr != zarray) { 1484 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 1485 } 1486 /* scatter the data from the temporary into the full vector with a += operation */ 1487 ierr = 
PetscLogGpuTimeBegin();CHKERRQ(ierr); 1488 if (dptr != zarray) { 1489 thrust::device_ptr<PetscScalar> zptr; 1490 1491 zptr = thrust::device_pointer_cast(zarray); 1492 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1493 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1494 VecCUDAPlusEquals()); 1495 } 1496 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1497 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1498 if (yy && !cmpr) { 1499 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1500 } else { 1501 ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1502 } 1503 } catch(char *ex) { 1504 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1505 } 1506 if (!yy) { /* MatMult */ 1507 if (!cusparsestruct->stream) { 1508 ierr = WaitForGPU();CHKERRCUDA(ierr); 1509 } 1510 } 1511 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1512 PetscFunctionReturn(0); 1513 } 1514 1515 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1516 { 1517 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1518 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1519 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1520 const PetscScalar *xarray; 1521 PetscScalar *zarray,*dptr,*beta; 1522 PetscErrorCode ierr; 1523 cusparseStatus_t stat; 1524 1525 PetscFunctionBegin; 1526 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1527 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1528 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1529 if (!matstructT) { 1530 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1531 matstructT = 
(Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1532 } 1533 1534 try { 1535 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1536 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1537 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1538 dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get(); 1539 beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero; 1540 1541 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1542 /* multiply add with matrix transpose */ 1543 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1544 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1545 /* here we need to be careful to set the number of rows in the multiply to the 1546 number of compressed rows in the matrix ... which is equivalent to the 1547 size of the workVector */ 1548 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1549 mat->num_rows, mat->num_cols, 1550 mat->num_entries, matstructT->alpha, matstructT->descr, 1551 mat->values->data().get(), mat->row_offsets->data().get(), 1552 mat->column_indices->data().get(), xarray, beta, 1553 dptr);CHKERRCUDA(stat); 1554 } else { 1555 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1556 if (cusparsestruct->workVector->size()) { 1557 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1558 matstructT->alpha, matstructT->descr, hybMat, 1559 xarray, beta, 1560 dptr);CHKERRCUDA(stat); 1561 } 1562 } 1563 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1564 1565 if (dptr != zarray) { 1566 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1567 } else if (zz != yy) { 1568 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1569 } 1570 /* scatter the data from the temporary into the full vector with a += operation */ 1571 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1572 if (dptr != zarray) { 1573 
thrust::device_ptr<PetscScalar> zptr; 1574 1575 zptr = thrust::device_pointer_cast(zarray); 1576 1577 /* scatter the data from the temporary into the full vector with a += operation */ 1578 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1579 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1580 VecCUDAPlusEquals()); 1581 } 1582 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1583 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1584 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1585 } catch(char *ex) { 1586 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1587 } 1588 ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */ 1589 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1594 { 1595 PetscErrorCode ierr; 1596 1597 PetscFunctionBegin; 1598 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1599 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1600 if (A->factortype == MAT_FACTOR_NONE) { 1601 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1602 } 1603 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1604 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1605 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1606 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1607 PetscFunctionReturn(0); 1608 } 1609 1610 /* --------------------------------------------------------------------------------*/ 1611 /*@ 1612 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1613 (the default parallel PETSc format). 
This matrix will ultimately pushed down 1614 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1615 assembly performance the user should preallocate the matrix storage by setting 1616 the parameter nz (or the array nnz). By setting these parameters accurately, 1617 performance during matrix assembly can be increased by more than a factor of 50. 1618 1619 Collective 1620 1621 Input Parameters: 1622 + comm - MPI communicator, set to PETSC_COMM_SELF 1623 . m - number of rows 1624 . n - number of columns 1625 . nz - number of nonzeros per row (same for all rows) 1626 - nnz - array containing the number of nonzeros in the various rows 1627 (possibly different for each row) or NULL 1628 1629 Output Parameter: 1630 . A - the matrix 1631 1632 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1633 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1634 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1635 1636 Notes: 1637 If nnz is given then nz is ignored 1638 1639 The AIJ format (also called the Yale sparse matrix format or 1640 compressed row storage), is fully compatible with standard Fortran 77 1641 storage. That is, the stored row and column indices can begin at 1642 either one (as in Fortran) or zero. See the users' manual for details. 1643 1644 Specify the preallocated storage with either nz or nnz (not both). 1645 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1646 allocation. For large problems you MUST preallocate memory or you 1647 will get TERRIBLE performance, see the users' manual chapter on matrices. 1648 1649 By default, this format uses inodes (identical nodes) when possible, to 1650 improve numerical efficiency of matrix-vector products and solves. We 1651 search for consecutive rows with the same nonzero structure, thereby 1652 reusing matrix information to achieve increased efficiency. 
1653 1654 Level: intermediate 1655 1656 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1657 @*/ 1658 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1659 { 1660 PetscErrorCode ierr; 1661 1662 PetscFunctionBegin; 1663 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1664 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1665 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1666 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1667 PetscFunctionReturn(0); 1668 } 1669 1670 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1671 { 1672 PetscErrorCode ierr; 1673 1674 PetscFunctionBegin; 1675 if (A->factortype==MAT_FACTOR_NONE) { 1676 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1677 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1678 } 1679 } else { 1680 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1681 } 1682 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1683 PetscFunctionReturn(0); 1684 } 1685 1686 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1687 { 1688 PetscErrorCode ierr; 1689 Mat C; 1690 cusparseStatus_t stat; 1691 cusparseHandle_t handle=0; 1692 1693 PetscFunctionBegin; 1694 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1695 C = *B; 1696 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1697 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1698 1699 /* inject CUSPARSE-specific stuff */ 1700 if (C->factortype==MAT_FACTOR_NONE) { 1701 /* you cannot check the inode.use flag here since the matrix was just created. 
1702 now build a GPU matrix data structure */ 1703 C->spptr = new Mat_SeqAIJCUSPARSE; 1704 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1705 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1706 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1707 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1708 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1709 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1710 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1711 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1712 } else { 1713 /* NEXT, set the pointers to the triangular factors */ 1714 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1715 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1716 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1717 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1718 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1719 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1720 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1721 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1723 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1724 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1725 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1726 } 1727 1728 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1729 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1730 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1731 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1732 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1733 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1734 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1735 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1736 1737 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1738 1739 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1740 1741 ierr = 
PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1742 PetscFunctionReturn(0); 1743 } 1744 1745 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1746 { 1747 PetscErrorCode ierr; 1748 cusparseStatus_t stat; 1749 cusparseHandle_t handle=0; 1750 1751 PetscFunctionBegin; 1752 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1753 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1754 1755 if (B->factortype==MAT_FACTOR_NONE) { 1756 /* you cannot check the inode.use flag here since the matrix was just created. 1757 now build a GPU matrix data structure */ 1758 B->spptr = new Mat_SeqAIJCUSPARSE; 1759 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1760 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1761 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1762 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1763 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1764 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1765 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1766 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1767 } else { 1768 /* NEXT, set the pointers to the triangular factors */ 1769 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1770 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1771 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1772 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1773 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1774 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1775 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1776 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1777 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1778 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1779 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1780 } 1781 1782 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 
1783 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1784 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1785 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1786 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1787 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1788 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1789 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1790 1791 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1792 1793 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1794 1795 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1796 PetscFunctionReturn(0); 1797 } 1798 1799 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1800 { 1801 PetscErrorCode ierr; 1802 1803 PetscFunctionBegin; 1804 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1805 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1806 PetscFunctionReturn(0); 1807 } 1808 1809 /*MC 1810 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1811 1812 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1813 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1814 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1815 1816 Options Database Keys: 1817 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1818 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1819 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1820 1821 Level: beginner 1822 1823 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1824 M*/ 1825 1826 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1827 1828 1829 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1830 { 1831 PetscErrorCode ierr; 1832 1833 PetscFunctionBegin; 1834 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1835 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1836 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1837 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1838 PetscFunctionReturn(0); 1839 } 1840 1841 1842 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1843 { 1844 cusparseStatus_t stat; 1845 cusparseHandle_t handle; 1846 1847 PetscFunctionBegin; 1848 if (*cusparsestruct) { 1849 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1850 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1851 delete (*cusparsestruct)->workVector; 1852 if (handle = (*cusparsestruct)->handle) { 1853 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1854 } 1855 delete *cusparsestruct; 1856 *cusparsestruct = 0; 1857 } 1858 PetscFunctionReturn(0); 1859 } 1860 1861 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1862 { 1863 PetscFunctionBegin; 1864 if (*mat) { 1865 delete (*mat)->values; 1866 delete (*mat)->column_indices; 1867 delete (*mat)->row_offsets; 1868 delete *mat; 1869 *mat = 0; 1870 } 1871 PetscFunctionReturn(0); 1872 } 1873 1874 
/* Free a triangular-factor structure: matrix descriptor, solve-analysis info,
   and the CSR data; zero the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a mult structure: the stored matrix (HYB handle or CsrMatrix, depending
   on format), the descriptor, the compressed-row index vector, and the
   device-resident alpha/beta scalars; zero the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* previously unchecked; CsrMatrix_Destroy() returns a PetscErrorCode */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are device allocations (freed with cudaFree) */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free all four triangular factors, the permutation index vectors, the work
   vector, and the cusparse handle; zero the caller's pointer.  The nested
   destroy calls are now error-checked. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}