/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for MatCUSPARSEStorageFormat; the trailing entries follow the
   PetscOptionsEnum convention: enum type name, option prefix, terminating 0. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the symbolic/numeric factorization kernels defined below */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations of the solve/mult kernels defined below */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations of the destructors for the GPU-side data structures.
   NOTE(review): the first MatSeqAIJCUSPARSEMultStruct_Destroy overload takes a
   Mat_SeqAIJCUSPARSETriFactorStruct**; presumably it was meant to be named
   MatSeqAIJCUSPARSETriFactorStruct_Destroy -- verify against the definitions
   later in this file before renaming. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a (caller-owned) CUDA stream to the cusparse handle stored in A so that
   subsequent cusparse calls for this matrix are queued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the cusparse handle of A with a caller-supplied one, destroying any handle
   the matrix previously held; also forces device pointer mode so cusparse reads
   scalar arguments from device memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the old handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cusparse handle stored in A; ownership of the
   handle itself remains with the caller who set it. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package (MATSOLVERCUSPARSE) that produced this factor matrix */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
  Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSEQAIJCUSPARSE: creates the (empty, square) factor
   matrix B and installs the symbolic factorization methods matching the requested
   factor type.  Only LU/ILU/ILUDT and Cholesky/ICC are supported. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated later, by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE:
   records the requested GPU storage format in the matrix's GPU context. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if one is composed on A */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options controlling the GPU storage format used for SpMV
   and (via MAT_CUSPARSE_ALL) triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option applies to unfactored matrices only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: defer to the CPU AIJ implementation, then install the CUSPARSE
   numeric factorization kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: defer to the CPU AIJ implementation, then install the CUSPARSE
   numeric factorization kernel. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: defer to the CPU AIJ implementation, then install the CUSPARSE
   numeric factorization kernel. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: defer to the CPU AIJ implementation, then install the CUSPARSE
   numeric factorization kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Copy the unit lower-triangular factor L of a factored matrix to the GPU and run the
   cusparse triangular-solve analysis on it.  The strictly lower entries are taken from
   the host factor arrays (a->i/a->j/a->a) and a unit diagonal is appended explicitly,
   since the host layout does not store it.  Staging is done through pinned host buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix; row 0 holds only its unit diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v  = aa;
      vi = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* append the unit diagonal entry after the strictly lower part of row i */
        offset += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust assign() copies the staged CSR arrays to the GPU */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the upper-triangular factor U of a factored matrix to the GPU and run the cusparse
   triangular-solve analysis on it.  U has a non-unit diagonal, stored here as its
   reciprocal so the solve can multiply instead of divide.  Rows are read via a->diag,
   which locates the U rows in the host factor storage (walked in reverse). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
 */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; rows are walked backwards because adiag[]
         decreases with i (U rows are stored in reverse order in the factor layout) */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; store 1/d so the solve multiplies */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust assign() copies the staged CSR arrays to the GPU */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy both ILU triangular factors to the GPU, run their solve analyses, and stage the
   row/column permutation indices (only when they are not the identity) for MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation data that was actually transferred */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC/Cholesky triangular factors on the GPU from the host factor storage.
   A single upper-triangular CSR pattern (AiUp/AjUp) is staged once with two value
   arrays: AAUp for the (unit-diagonal) upper solve and AALo, rescaled by the diagonal,
   applied with a TRANSPOSE solve to realize the lower solve.
   NOTE(review): A->data is viewed both as Mat_SeqAIJ (a) and Mat_SeqSBAIJ (b);
   presumably the ICC factor shares the leading i/j/a layout fields -- verify. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (stored as reciprocals) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals; AALo additionally rescales by the diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust assign() copies the staged CSR arrays to the GPU */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description; the "lower" factor is stored upper-triangular and
         applied below with a TRANSPOSE solve, which realizes the lower triangular solve */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as the upper factor, different values (AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the ICC triangular factors to the GPU and stage the permutation and its inverse
   (only when not the identity) for use in MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the lower and upper triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* total factor nonzeros: both triangles share the off-diagonal count */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* row permutation and its inverse (column side) are both required for the solve */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 670 { 671 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 672 IS isrow = b->row,iscol = b->col; 673 PetscBool row_identity,col_identity; 674 PetscErrorCode ierr; 675 676 PetscFunctionBegin; 677 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 678 /* determine which version of MatSolve needs to be used. */ 679 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 680 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 681 if (row_identity && col_identity) { 682 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 683 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 684 B->ops->matsolve = NULL; 685 B->ops->matsolvetranspose = NULL; 686 } else { 687 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 688 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 689 B->ops->matsolve = NULL; 690 B->ops->matsolvetranspose = NULL; 691 } 692 693 /* get the triangular factors */ 694 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 695 PetscFunctionReturn(0); 696 } 697 698 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 699 { 700 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 701 IS ip = b->row; 702 PetscBool perm_identity; 703 PetscErrorCode ierr; 704 705 PetscFunctionBegin; 706 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 707 708 /* determine which version of MatSolve needs to be used. 
*/ 709 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 710 if (perm_identity) { 711 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 712 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 713 B->ops->matsolve = NULL; 714 B->ops->matsolvetranspose = NULL; 715 } else { 716 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 717 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 718 B->ops->matsolve = NULL; 719 B->ops->matsolvetranspose = NULL; 720 } 721 722 /* get the triangular factors */ 723 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } 726 727 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 728 { 729 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 730 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 731 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 732 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 733 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 734 cusparseStatus_t stat; 735 cusparseIndexBase_t indexBase; 736 cusparseMatrixType_t matrixType; 737 cusparseFillMode_t fillMode; 738 cusparseDiagType_t diagType; 739 740 PetscFunctionBegin; 741 742 /*********************************************/ 743 /* Now the Transpose of the Lower Tri Factor */ 744 /*********************************************/ 745 746 /* allocate space for the transpose of the lower triangular factor */ 747 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 748 749 /* set the matrix descriptors of the lower triangular factor */ 750 matrixType = cusparseGetMatType(loTriFactor->descr); 751 indexBase = 
cusparseGetMatIndexBase(loTriFactor->descr);
  /* fill mode is flipped because the transpose of a lower factor is upper, and vice versa */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: data is already transposed, so the solve itself is NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC
     (csc row indices land in column_indices, csc column pointers in row_offsets,
      so the result can be read back as a CSR matrix of the transpose) */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds the explicit transpose of the
   GPU matrix (CSR or HYB/ELL) used by MatMultTranspose, and stores it in
   cusparsestruct->matTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars, matching the
     CUSPARSE_POINTER_MODE_DEVICE set below */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    /* transpose of an m x n CSR matrix is an n x m CSR matrix */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* full (non-compressed) row offsets copied from the host CSR row pointer a->i */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps num_rows = A->rmap->n for the transpose, which looks
       like it assumes a square matrix — TODO confirm for rectangular A */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is never stored compressed,
     so this is simply the identity sequence 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transpose solve without any row/column permutation (natural ordering):
   skips the thrust permutation copies of the permuted variant above */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/back substitution with row permutation applied to b on entry
   and column permutation applied to the solution on exit */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/back substitution for identity row/column permutations:
   no permutation copies are needed */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
&PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyToGPU - copies/rebuilds the GPU representation of the matrix
   when the host copy is newer.  If only numerical values changed (same nonzero
   pattern, CSR format) it copies values in place and refreshes the transpose;
   otherwise it destroys and rebuilds the whole GPU structure, always storing the
   matrix in compressed-row form on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUDA(stat);
      }
    } else {
      /* nonzero pattern changed (or non-CSR format): rebuild from scratch */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident alpha/beta scalars for the POINTER_MODE_DEVICE spmv calls */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* temporary CSR used only as input to csr2hyb, deleted below */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were PetscMalloc'd above only in the non-compressed case */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* thrust functor: accumulate the first tuple element into the second (y += x) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A*x, implemented as MatMultAdd with no addend vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 1381 PetscFunctionReturn(0); 1382 } 1383 1384 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1385 { 1386 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1387 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1388 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1389 const PetscScalar *xarray; 1390 PetscScalar *yarray; 1391 PetscErrorCode ierr; 1392 cusparseStatus_t stat; 1393 1394 PetscFunctionBegin; 1395 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1396 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1397 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1398 if (!matstructT) { 1399 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1400 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1401 } 1402 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1403 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1404 if (yy->map->n) { 1405 PetscInt n = yy->map->n; 1406 cudaError_t err; 1407 err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */ 1408 } 1409 1410 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1411 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1412 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1413 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1414 mat->num_rows, mat->num_cols, 1415 mat->num_entries, matstructT->alpha, matstructT->descr, 1416 mat->values->data().get(), mat->row_offsets->data().get(), 1417 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1418 yarray);CHKERRCUDA(stat); 1419 } else { 1420 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1421 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1422 matstructT->alpha, matstructT->descr, hybMat, 
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* zz = A*xx + yy (yy may be NULL, giving zz = A*xx). When the matrix stores only
   its nonzero rows (compressed-row storage), the SpMV result lands in the short
   device workVector and is then scatter-added into zz via cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSE MultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when no rows were compressed away (workVector covers every row) */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* write SpMV output directly into zz when uncompressed; otherwise into the scratch vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place (beta=1) only when zz doubles as both addend and destination */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* establish the additive base in zz before the scatter-add from the scratch vector */
    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy, using the cached explicit transpose (see MatMultTranspose above) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed zz with the addend up front; repeated below only when a scratch vector was used */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write directly into zz only when the scratch vector covers all columns of A */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ?
           matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation.
         NOTE(review): the end iterator advances by A->cmap->n here, while the sibling
         MatMultAdd advances by cusparsestruct->workVector->size(); confirm the two are
         equal on this path (compressed columns in the transpose) before relying on it. */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* After a final assembly on the CPU, push the data to the GPU and (re)install the
   GPU-backed mat-vec operations. Flush assemblies are a no-op here. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
   nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy: release the GPU-side data (plain mult structs or triangular factors,
   depending on the factor type) before delegating to the base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicate: copy via the base SeqAIJ duplicate, then attach a fresh (empty)
   cuSPARSE context to the copy; GPU data is not copied, offload state is reset. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat            = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector     = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format         = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle         = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  /* install the GPU-backed operations on the copy */
  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* In-place conversion of an existing SeqAIJ matrix to SeqAIJCUSPARSE: switches the
   default vector type to CUDA, attaches an empty GPU context (mult structs or
   triangular factors, per the factor type), and installs the GPU-backed ops. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat            = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector     = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format         = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle         = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors.
       NOTE(review): unlike MatDuplicate above, handle is not zeroed before
       cusparseCreate here; harmless since it is overwritten, but asymmetric. */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;

    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions().
Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Register the cusparse solver package: the same factory routine serves LU, Cholesky,
   ILU and ICC factorizations of MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* Free the Mat_SeqAIJCUSPARSE context: both mult structs (plain and transpose),
   the device work vector and row offsets, and the cuSPARSE handle. Sets the
   caller's pointer to 0. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* check the error codes of the nested destroys (previously silently discarded) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((handle = (*cusparsestruct)->handle)) { /* assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (thrust device arrays plus the struct itself) and zero the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor struct: matrix descriptor, solve-analysis info, and
   the CSR storage of the factor. Sets the caller's pointer to 0. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one SpMV struct according to its storage format (HYB matrix or CSR arrays),
   plus the descriptor, compressed-row index array, and the device-resident
   alpha/beta constants. Sets the caller's pointer to 0. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* propagate the error code (previously ignored) */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free the full triangular-factor context: all four factor structs, the
   permutation index arrays, the work vector, and the cuSPARSE handle. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the error codes of the nested destroys (previously silently discarded) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
