/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String names for the MatCUSPARSEStorageFormat enum values; the trailing entries
   (enum type name, option prefix, null terminator) follow the PetscEnum convention */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization kernels implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations of the triangular solve, multiply, and options kernels */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations of destruction helpers (declaration continues on the next line of the file) */
static PetscErrorCode
CsrMatrix_Destroy(CsrMatrix**);
/* NOTE: the next two declarations are C++ overloads of the same name, one taking a
   triangular-factor struct and one taking a multiply struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Sets the CUDA stream used by the cusparse handle attached to A.
   Assumes A->spptr holds a Mat_SeqAIJCUSPARSE with a valid handle. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Attaches a cusparse handle to A, destroying any previous (different) handle,
   and sets device pointer mode so scalar arguments of cusparse calls are read
   from GPU memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the old handle before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drops the cusparse handle reference without destroying it; the handle itself
   remains owned by whoever supplied it via MatCUSPARSESetHandle(). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Reports MATSOLVERCUSPARSE as the solver package providing this factorization */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Creates the (unallocated) factor matrix B for an LU/ILU/ILUDT or Cholesky/ICC
   factorization of A and installs the symbolic-factorization entry points; the
   numeric kernels are installed later by the symbolic routines themselves. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor matrix gets its storage during the symbolic phase, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices
   (signature continues on the next line of the file) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* for sequential matrices there is a single storage format, so MULT and ALL
     both simply record the requested format */
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently a no-op for other types */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_mult_storage_format and -mat_cusparse_storage_format
   command-line options, forwarding any user choice to MatCUSPARSESetFormat(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option applies only to unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU factorization: defer to the CPU SeqAIJ implementation, then install
   the CUSPARSE numeric kernel (statement continues on the next line of the file). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU factorization: defer to the CPU SeqAIJ implementation, then install
   the CUSPARSE numeric kernel. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC factorization: defer to the CPU SeqAIJ implementation, then install
   the CUSPARSE numeric Cholesky kernel. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky factorization: defer to the CPU SeqAIJ implementation, then
   install the CUSPARSE numeric Cholesky kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Builds the unit lower triangular ILU factor L on the GPU: copies the strictly
   lower triangular part of the host factor, inserts 1's on the diagonal, uploads
   it as CSR, and runs the cusparse triangular-solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the GPU copy is missing or stale relative to the host */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for the upload) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix; row 0 holds only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload the three CSR arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned staging buffers are no longer needed once the thrust arrays own copies */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the upper triangular ILU factor U on the GPU: copies the upper triangular
   part of the host factor with the diagonal entries inverted (stored as 1/d), uploads
   it as CSR, and runs the cusparse triangular-solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows last-to-first and filling
         AjUp/AAUp from the back (offset counts down from nzUpper) */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored inverted: 1/d) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
  upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* upload the three CSR arrays of U to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Uploads both ILU triangular factors to the GPU, allocates the work vector used
   between the two triangular solves, and caches the row/column permutations on the
   device when they are not the identity (statement continues on the next file line). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
  new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation only when non-identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: cache the column permutation only when non-identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only whichever permutation arrays were actually uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds both ICC triangular factors on the GPU from the host factor's upper
   triangular storage: an upper factor with unit diagonal (values in AAUp) and a
   companion factor (values in AALo) solved via a transpose operation below. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as Mat_SeqSBAIJ here while declared
     Mat_SeqAIJ above; presumably the ICC factor uses SBAIJ-compatible layout — verify */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask ==
  PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; both value arrays share the AiUp/AjUp pattern */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (stored inverted in both arrays) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* copy the off-diagonal entries, negating for AAUp and scaling by 1/d for AALo */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description. NOTE: the "lower" factor intentionally keeps
         UPPER fill mode — it reuses the upper-triangular pattern and is solved with a
         TRANSPOSE operation (set below), which yields the lower triangular solve. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation: transpose solve of the upper pattern */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis (call continues on the next line of the file) */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Uploads the ICC triangular factors to the GPU, allocates the triangular-solve work
   vector, and caches the permutation and its inverse when the ordering is not the
   identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonal entries appear in both factors, the diagonal only once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* the same ordering permutes rows and columns; upload it and its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU factorization (declaration continues on the next line of the file) */
static
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric factorization happens on the CPU; this routine then selects the solve
     kernels and mirrors the factors onto the GPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used: the NaturalOrdering
     variants skip the permutation step entirely */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC factorization: run the CPU kernel, choose the solve variant
   based on whether the ordering is the identity, then upload the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds explicit transposes (via csr2csc) of both triangular factors and runs the
   solve analysis on them, for use by the transpose solves. The function body
   continues past the end of this edit's range. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor
     (statement continues on the next line of the file) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  =
cusparseGetMatIndexBase(loTriFactor->descr); 752 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 753 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 754 diagType = cusparseGetMatDiagType(loTriFactor->descr); 755 756 /* Create the matrix description */ 757 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 758 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 759 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 760 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 761 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 762 763 /* Create the solve analysis information */ 764 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 765 766 /* set the operation */ 767 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 768 769 /* allocate GPU space for the CSC of the lower triangular factor*/ 770 loTriFactorT->csrMat = new CsrMatrix; 771 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 772 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 773 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 774 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 775 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 776 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 777 778 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 779 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 780 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 781 loTriFactor->csrMat->values->data().get(), 782 loTriFactor->csrMat->row_offsets->data().get(), 783 loTriFactor->csrMat->column_indices->data().get(), 784 loTriFactorT->csrMat->values->data().get(), 785 loTriFactorT->csrMat->column_indices->data().get(), 786 loTriFactorT->csrMat->row_offsets->data().get(), 787 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 788 789 /* perform the solve analysis on the transposed matrix */ 790 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 791 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 792 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 793 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 794 loTriFactorT->solveInfo);CHKERRCUDA(stat); 795 796 /* assign the pointer. Is this really necessary? */ 797 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 798 799 /*********************************************/ 800 /* Now the Transpose of the Upper Tri Factor */ 801 /*********************************************/ 802 803 /* allocate space for the transpose of the upper triangular factor */ 804 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 805 806 /* set the matrix descriptors of the upper triangular factor */ 807 matrixType = cusparseGetMatType(upTriFactor->descr); 808 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 809 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
810 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 811 diagType = cusparseGetMatDiagType(upTriFactor->descr); 812 813 /* Create the matrix description */ 814 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 815 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 816 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 817 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 818 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 819 820 /* Create the solve analysis information */ 821 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 822 823 /* set the operation */ 824 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 825 826 /* allocate GPU space for the CSC of the upper triangular factor*/ 827 upTriFactorT->csrMat = new CsrMatrix; 828 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 829 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 830 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 831 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 832 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 833 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 834 835 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 836 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 837 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 838 upTriFactor->csrMat->values->data().get(), 839 upTriFactor->csrMat->row_offsets->data().get(), 840 upTriFactor->csrMat->column_indices->data().get(), 841 upTriFactorT->csrMat->values->data().get(), 842 upTriFactorT->csrMat->column_indices->data().get(), 843 upTriFactorT->csrMat->row_offsets->data().get(), 844 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 845 846 /* perform the solve analysis on the transposed matrix */ 847 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 848 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 849 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 850 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 851 upTriFactorT->solveInfo);CHKERRCUDA(stat); 852 853 /* assign the pointer. Is this really necessary? 
*/ 854 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 855 PetscFunctionReturn(0); 856 } 857 858 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 859 { 860 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 861 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 862 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 863 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 864 cusparseStatus_t stat; 865 cusparseIndexBase_t indexBase; 866 cudaError_t err; 867 PetscErrorCode ierr; 868 869 PetscFunctionBegin; 870 871 /* allocate space for the triangular factor information */ 872 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 873 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 874 indexBase = cusparseGetMatIndexBase(matstruct->descr); 875 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 876 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 877 878 /* set alpha and beta */ 879 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 880 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 881 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 882 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 883 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 884 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 885 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 886 887 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 888 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 889 
CsrMatrix *matrixT= new CsrMatrix; 890 thrust::device_vector<int> *rowoffsets_gpu; 891 const int ncomprow = matstruct->cprowIndices->size(); 892 thrust::host_vector<int> cprow(ncomprow); 893 cprow = *matstruct->cprowIndices; // GPU --> CPU 894 matrixT->num_rows = A->cmap->n; 895 matrixT->num_cols = A->rmap->n; 896 matrixT->num_entries = a->nz; 897 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 898 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 899 matrixT->values = new THRUSTARRAY(a->nz); 900 PetscInt k,i; 901 thrust::host_vector<int> rowst_host(A->rmap->n+1); 902 903 /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */ 904 rowst_host[0] = 0; 905 for (k = 0, i = 0; i < A->rmap->n ; i++) { 906 if (k < ncomprow && i==cprow[k]) { 907 rowst_host[i+1] = a->i[i+1]; 908 k++; 909 } else rowst_host[i+1] = rowst_host[i]; 910 } 911 if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow); 912 rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 913 *rowoffsets_gpu = rowst_host; // CPU --> GPU 914 915 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 916 indexBase = cusparseGetMatIndexBase(matstruct->descr); 917 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 918 A->cmap->n, matrix->num_entries, 919 matrix->values->data().get(), 920 rowoffsets_gpu->data().get(), 921 matrix->column_indices->data().get(), 922 matrixT->values->data().get(), 923 matrixT->column_indices->data().get(), 924 matrixT->row_offsets->data().get(), 925 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 926 927 /* assign the pointer */ 928 matstructT->mat = matrixT; 929 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 930 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 931 /* First convert HYB to CSR */ 932 CsrMatrix *temp= new CsrMatrix; 933 temp->num_rows = A->rmap->n; 934 temp->num_cols = A->cmap->n; 935 temp->num_entries = a->nz; 936 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 937 temp->column_indices = new THRUSTINTARRAY32(a->nz); 938 temp->values = new THRUSTARRAY(a->nz); 939 940 941 stat = cusparse_hyb2csr(cusparsestruct->handle, 942 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 943 temp->values->data().get(), 944 temp->row_offsets->data().get(), 945 temp->column_indices->data().get());CHKERRCUDA(stat); 946 947 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 948 CsrMatrix *tempT= new CsrMatrix; 949 tempT->num_rows = A->rmap->n; 950 tempT->num_cols = A->cmap->n; 951 tempT->num_entries = a->nz; 952 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 953 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 954 tempT->values = new THRUSTARRAY(a->nz); 955 956 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 957 temp->num_cols, temp->num_entries, 958 temp->values->data().get(), 959 temp->row_offsets->data().get(), 960 temp->column_indices->data().get(), 961 tempT->values->data().get(), 962 tempT->column_indices->data().get(), 963 tempT->row_offsets->data().get(), 964 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 965 966 /* Last, convert CSC to HYB */ 967 cusparseHybMat_t hybMat; 968 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 969 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 970 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 971 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 972 matstructT->descr, tempT->values->data().get(), 973 tempT->row_offsets->data().get(), 974 tempT->column_indices->data().get(), 975 hybMat, 0, partition);CHKERRCUDA(stat); 976 977 /* assign the pointer */ 978 matstructT->mat = hybMat; 979 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 980 981 /* delete temporaries */ 982 if (tempT) { 983 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 984 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 985 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 986 delete (CsrMatrix*) tempT; 987 } 988 if (temp) { 989 if (temp->values) delete (THRUSTARRAY*) temp->values; 990 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 991 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 992 delete (CsrMatrix*) temp; 993 
} 994 } 995 /* assign the compressed row indices */ 996 matstructT->cprowIndices = new THRUSTINTARRAY; 997 matstructT->cprowIndices->resize(A->cmap->n); 998 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 999 /* assign the pointer */ 1000 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1001 PetscFunctionReturn(0); 1002 } 1003 1004 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1005 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1006 { 1007 PetscInt n = xx->map->n; 1008 const PetscScalar *barray; 1009 PetscScalar *xarray; 1010 thrust::device_ptr<const PetscScalar> bGPU; 1011 thrust::device_ptr<PetscScalar> xGPU; 1012 cusparseStatus_t stat; 1013 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1014 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1015 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1016 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1017 PetscErrorCode ierr; 1018 1019 PetscFunctionBegin; 1020 /* Analyze the matrix and create the transpose ... 
on the fly */ 1021 if (!loTriFactorT && !upTriFactorT) { 1022 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1023 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1024 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1025 } 1026 1027 /* Get the GPU pointers */ 1028 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1029 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1030 xGPU = thrust::device_pointer_cast(xarray); 1031 bGPU = thrust::device_pointer_cast(barray); 1032 1033 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1034 /* First, reorder with the row permutation */ 1035 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1036 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1037 xGPU); 1038 1039 /* First, solve U */ 1040 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1041 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1042 upTriFactorT->csrMat->values->data().get(), 1043 upTriFactorT->csrMat->row_offsets->data().get(), 1044 upTriFactorT->csrMat->column_indices->data().get(), 1045 upTriFactorT->solveInfo, 1046 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1047 1048 /* Then, solve L */ 1049 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1050 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1051 loTriFactorT->csrMat->values->data().get(), 1052 loTriFactorT->csrMat->row_offsets->data().get(), 1053 loTriFactorT->csrMat->column_indices->data().get(), 1054 loTriFactorT->solveInfo, 1055 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1056 1057 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1058 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1059 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1060 tempGPU->begin()); 1061 1062 /* Copy the temporary to the full solution. */ 1063 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1064 1065 /* restore */ 1066 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1067 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1068 ierr = WaitForGPU();CHKERRCUDA(ierr); 1069 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1070 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1071 PetscFunctionReturn(0); 1072 } 1073 1074 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1075 { 1076 const PetscScalar *barray; 1077 PetscScalar *xarray; 1078 cusparseStatus_t stat; 1079 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1080 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1081 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1082 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1083 PetscErrorCode ierr; 1084 1085 PetscFunctionBegin; 1086 /* Analyze the matrix and create the transpose ... 
on the fly */ 1087 if (!loTriFactorT && !upTriFactorT) { 1088 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1089 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1090 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1091 } 1092 1093 /* Get the GPU pointers */ 1094 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1095 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1096 1097 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1098 /* First, solve U */ 1099 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1100 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1101 upTriFactorT->csrMat->values->data().get(), 1102 upTriFactorT->csrMat->row_offsets->data().get(), 1103 upTriFactorT->csrMat->column_indices->data().get(), 1104 upTriFactorT->solveInfo, 1105 barray, tempGPU->data().get());CHKERRCUDA(stat); 1106 1107 /* Then, solve L */ 1108 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1109 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1110 loTriFactorT->csrMat->values->data().get(), 1111 loTriFactorT->csrMat->row_offsets->data().get(), 1112 loTriFactorT->csrMat->column_indices->data().get(), 1113 loTriFactorT->solveInfo, 1114 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1115 1116 /* restore */ 1117 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1118 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1119 ierr = WaitForGPU();CHKERRCUDA(ierr); 1120 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1121 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1122 PetscFunctionReturn(0); 1123 } 1124 1125 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1126 { 1127 const PetscScalar *barray; 1128 PetscScalar *xarray; 1129 thrust::device_ptr<const PetscScalar> bGPU; 1130 thrust::device_ptr<PetscScalar> 
xGPU; 1131 cusparseStatus_t stat; 1132 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1133 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1134 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1135 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1136 PetscErrorCode ierr; 1137 1138 PetscFunctionBegin; 1139 1140 /* Get the GPU pointers */ 1141 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1142 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1143 xGPU = thrust::device_pointer_cast(xarray); 1144 bGPU = thrust::device_pointer_cast(barray); 1145 1146 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1147 /* First, reorder with the row permutation */ 1148 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1149 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1150 tempGPU->begin()); 1151 1152 /* Next, solve L */ 1153 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1154 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1155 loTriFactor->csrMat->values->data().get(), 1156 loTriFactor->csrMat->row_offsets->data().get(), 1157 loTriFactor->csrMat->column_indices->data().get(), 1158 loTriFactor->solveInfo, 1159 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1160 1161 /* Then, solve U */ 1162 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1163 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1164 upTriFactor->csrMat->values->data().get(), 1165 upTriFactor->csrMat->row_offsets->data().get(), 1166 upTriFactor->csrMat->column_indices->data().get(), 1167 upTriFactor->solveInfo, 1168 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1169 1170 /* Last, reorder with the column permutation */ 1171 
thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1172 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1173 xGPU); 1174 1175 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1176 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1177 ierr = WaitForGPU();CHKERRCUDA(ierr); 1178 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1179 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1180 PetscFunctionReturn(0); 1181 } 1182 1183 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1184 { 1185 const PetscScalar *barray; 1186 PetscScalar *xarray; 1187 cusparseStatus_t stat; 1188 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1189 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1190 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1191 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1192 PetscErrorCode ierr; 1193 1194 PetscFunctionBegin; 1195 /* Get the GPU pointers */ 1196 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1197 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1198 1199 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1200 /* First, solve L */ 1201 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1202 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1203 loTriFactor->csrMat->values->data().get(), 1204 loTriFactor->csrMat->row_offsets->data().get(), 1205 loTriFactor->csrMat->column_indices->data().get(), 1206 loTriFactor->solveInfo, 1207 barray, tempGPU->data().get());CHKERRCUDA(stat); 1208 1209 /* Next, solve U */ 1210 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1211 upTriFactor->csrMat->num_rows, 
&PETSC_CUSPARSE_ONE, upTriFactor->descr, 1212 upTriFactor->csrMat->values->data().get(), 1213 upTriFactor->csrMat->row_offsets->data().get(), 1214 upTriFactor->csrMat->column_indices->data().get(), 1215 upTriFactor->solveInfo, 1216 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1217 1218 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1219 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1220 ierr = WaitForGPU();CHKERRCUDA(ierr); 1221 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1222 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1223 PetscFunctionReturn(0); 1224 } 1225 1226 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1227 { 1228 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1229 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1231 PetscInt m = A->rmap->n,*ii,*ridx; 1232 PetscErrorCode ierr; 1233 cusparseStatus_t stat; 1234 cudaError_t err; 1235 1236 PetscFunctionBegin; 1237 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1238 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1239 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1240 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1241 /* copy values only */ 1242 matrix->values->assign(a->a, a->a+a->nz); 1243 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1244 } else { 1245 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1246 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1247 delete cusparsestruct->workVector; 1248 try { 1249 cusparsestruct->nonzerorow=0; 1250 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1251 1252 if (a->compressedrow.use) { 1253 m = 
a->compressedrow.nrows; 1254 ii = a->compressedrow.i; 1255 ridx = a->compressedrow.rindex; 1256 } else { 1257 /* Forcing compressed row on the GPU */ 1258 int k=0; 1259 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1260 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1261 ii[0]=0; 1262 for (int j = 0; j<m; j++) { 1263 if ((a->i[j+1]-a->i[j])>0) { 1264 ii[k] = a->i[j]; 1265 ridx[k]= j; 1266 k++; 1267 } 1268 } 1269 ii[cusparsestruct->nonzerorow] = a->nz; 1270 m = cusparsestruct->nonzerorow; 1271 } 1272 1273 /* allocate space for the triangular factor information */ 1274 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1275 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1276 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1277 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1278 1279 err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1280 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1281 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1282 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1283 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1284 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1285 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1286 1287 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1288 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1289 /* set the matrix */ 1290 CsrMatrix *matrix= new CsrMatrix; 1291 matrix->num_rows = m; 1292 matrix->num_cols = A->cmap->n; 1293 matrix->num_entries = a->nz; 
1294 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1295 matrix->row_offsets->assign(ii, ii + m+1); 1296 1297 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1298 matrix->column_indices->assign(a->j, a->j+a->nz); 1299 1300 matrix->values = new THRUSTARRAY(a->nz); 1301 matrix->values->assign(a->a, a->a+a->nz); 1302 1303 /* assign the pointer */ 1304 matstruct->mat = matrix; 1305 1306 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1307 CsrMatrix *matrix= new CsrMatrix; 1308 matrix->num_rows = m; 1309 matrix->num_cols = A->cmap->n; 1310 matrix->num_entries = a->nz; 1311 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1312 matrix->row_offsets->assign(ii, ii + m+1); 1313 1314 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1315 matrix->column_indices->assign(a->j, a->j+a->nz); 1316 1317 matrix->values = new THRUSTARRAY(a->nz); 1318 matrix->values->assign(a->a, a->a+a->nz); 1319 1320 cusparseHybMat_t hybMat; 1321 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1322 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          /* convert the staged CSR data into cuSPARSE HYB/ELL storage */
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR arrays were only a staging copy for the HYB conversion; free them */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices (device copy of ridx, one entry per stored row) */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx are temporaries when the matrix is not in compressed-row form
           (presumably allocated by the unseen head of this routine -- confirm there) */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector sized to the number of stored (compressed) rows */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    /* host and device copies now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor applied through thrust::for_each over a zip iterator:
   adds the first tuple element into the second.  Used below to
   scatter-add the compressed-row work vector into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* yy = A*xx, implemented by delegating to MatMultAdd with a NULL addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx, using the lazily generated explicit transpose (matTranspose)
   so cuSPARSE runs in its (faster) non-transpose mode */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt    n = yy->map->n;
    cudaError_t err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* NOTE(review): unlike the Add variants below, this branch does not guard on
       workVector->size(); confirm the HYB path is valid for empty matrices */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* zz = A*xx + yy (yy may be NULL, then zz = A*xx).
   When the matrix is stored in compressed-row form (zero rows elided) the
   SpMV result lands in workVector and is scatter-added into zz through
   cprowIndices; otherwise cuSPARSE writes straight into zz's device array. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr == TRUE means no rows were elided: workVector spans all rows */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      /* NOTE(review): when yy == zz and cmpr, beta_one below makes cuSPARSE read
         zarray, yet it is acquired write-only here -- confirm zz is guaranteed
         up to date on the device in that case */
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* destination: zz directly, or the compressed work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one only when cuSPARSE can fold the addition of yy (== zz) in-place */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* compressed path: seed zz with yy, the product is scatter-added below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* uncompressed, distinct addend: product already in zz, add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult on compressed storage: elided rows must read as zero */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy, using the explicit transpose structure (see MatMultTranspose) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed zz with yy; redundant when the conditional copy below also fires --
       kept as-is because the beta selection depends on this exact ordering */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation */
      /* NOTE(review): the iteration count here is A->cmap->n while the analogous
         scatter in MatMultAdd uses workVector->size(); when this branch is taken
         the two differ by construction (dptr != zarray), so iterating past
         workVector->size() looks out of bounds -- TODO confirm against upstream */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Final assembly: push the data to the GPU and (re)install the GPU mult ops */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroys the GPU-side data (or the triangular-factor data for factored
   matrices) and then dispatches to the host MatDestroy_SeqAIJ. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* spptr (and its cusparse handle) is allocated at create/convert time, before
       any data is offloaded, so it must be destroyed regardless of the offload
       state; the previous guard on PETSC_OFFLOAD_UNALLOCATED leaked both for
       matrices that were never assembled.  MatSeqAIJCUSPARSE_Destroy null-checks
       every member, so this is safe in all states. */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicates a SeqAIJCUSPARSE matrix: host-side duplicate, then a fresh
   (empty) GPU data structure; GPU data is rebuilt lazily at next use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
now build a GPU matrix data structure */
    /* empty GPU image: populated on demand by MatSeqAIJCUSPARSECopyToGPU */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Converts a (host) SeqAIJ matrix in place to SeqAIJCUSPARSE: installs the GPU
   data structure (or triangular-factor container), the GPU op table and the
   CUDA default vector type.  NOTE: this initialization block mirrors the one
   in MatDuplicate_SeqAIJCUSPARSE above; keep the two in sync. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create as SeqAIJ on the host, then convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1824 1825 Level: beginner 1826 1827 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1828 M*/ 1829 1830 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1831 1832 1833 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1834 { 1835 PetscErrorCode ierr; 1836 1837 PetscFunctionBegin; 1838 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1839 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1840 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1841 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1842 PetscFunctionReturn(0); 1843 } 1844 1845 1846 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1847 { 1848 cusparseStatus_t stat; 1849 cusparseHandle_t handle; 1850 1851 PetscFunctionBegin; 1852 if (*cusparsestruct) { 1853 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1854 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1855 delete (*cusparsestruct)->workVector; 1856 if (handle = (*cusparsestruct)->handle) { 1857 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1858 } 1859 delete *cusparsestruct; 1860 *cusparsestruct = 0; 1861 } 1862 PetscFunctionReturn(0); 1863 } 1864 1865 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1866 { 1867 PetscFunctionBegin; 1868 if (*mat) { 1869 delete (*mat)->values; 1870 delete (*mat)->column_indices; 1871 delete (*mat)->row_offsets; 1872 delete *mat; 1873 *mat = 0; 1874 } 1875 PetscFunctionReturn(0); 1876 } 1877 1878 
/* Frees one triangular-factor structure: its matrix descriptor, solve-analysis
   info and CSR storage; zeroes the caller's pointer.  NULL-safe. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees a SpMV structure: the device matrix (HYB or CSR depending on the
   storage format), descriptor, compressed-row index array and the device-side
   alpha/beta scalars; zeroes the caller's pointer.  NULL-safe. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* previously the return value of CsrMatrix_Destroy was silently dropped */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices; /* deleting NULL is a no-op */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees the complete triangular-factor container (all four factor structures,
   permutation index arrays, work vector and cusparse handle); zeroes the
   caller's pointer.  NULL-safe. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* previously the PetscErrorCode returned by these destroys was silently
       dropped; propagate it like the rest of the file does */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}