/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Enum-name table consumed by PetscOptionsEnum(); the trailing three entries
   (type name, prefix, terminating 0) follow the PETSc enum-string convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization and solve entry points installed
   on the function tables of SEQAIJCUSPARSE matrices; definitions follow below. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode
CsrMatrix_Destroy(CsrMatrix**);
/* NOTE(review): the next two declarations overload the same name with different argument
   types (TriFactorStruct vs MultStruct) — legal in C++/.cu; the definitions live later in
   this file, so the naming is kept as-is here. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
  MatCUSPARSESetStream - records the given CUDA stream on the matrix's GPU data
  structure and attaches it to the matrix's cusparse handle, so subsequent
  cusparse calls made for this matrix run on that stream.

  NOTE(review): assumes A->spptr is a valid Mat_SeqAIJCUSPARSE (i.e. A is a
  SEQAIJCUSPARSE matrix) — no type check is performed here; confirm callers guarantee this.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
  MatCUSPARSESetHandle - installs an externally created cusparse handle on the
  matrix, destroying any handle the matrix previously owned (if different), and
  sets the handle's pointer mode to CUSPARSE_POINTER_MODE_DEVICE (scalar
  arguments to cusparse routines are then read from device memory).
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* drop the handle this matrix owned before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
  MatCUSPARSEClearHandle - forgets the matrix's reference to its cusparse handle
  WITHOUT destroying it (presumably the handle is owned by whoever installed it
  via MatCUSPARSESetHandle() — NOTE(review): confirm against callers).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports this factorization's solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
  Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
  Factory routine registered for the "cusparse" solver package: creates the (empty)
  factor matrix B for an LU/ILU/ILUDT or Cholesky/ICC factorization of A and installs
  the symbolic-factorization entry points; numeric routines are installed by the
  symbolic phase. B is square of the local size of A (sequential matrix).
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor's nonzero structure is allocated by the symbolic factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  /* a sequential matrix has a single stored format, so MULT and ALL set the same field */
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (no-op if the type did not compose it) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing: picks the GPU storage format(s) from the options database.
   -mat_cusparse_mult_storage_format applies only to non-factored matrices (SpMV);
   -mat_cusparse_storage_format sets the format used for both SpMV and TriSolve. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* ILU(k) symbolic phase: defer to the CPU AIJ implementation, then route the
   numeric phase to the CUSPARSE version so the factors are mirrored on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* LU symbolic phase: CPU symbolic factorization, GPU numeric phase (see MATSOLVERCUSPARSE note on full LU performance). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* ICC(k) symbolic phase: CPU symbolic factorization, GPU numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Cholesky symbolic phase: CPU symbolic factorization, GPU numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
  Builds the GPU copy of the unit lower triangular factor L from the CPU (ILU-factored)
  matrix A, and runs the cusparse solve analysis on it.

  A here is the factor matrix: its a->i/a->j/a->a are read as the strictly lower
  triangular rows (NOTE(review): relies on the PETSc factored-AIJ storage layout —
  confirmed only by the copying pattern below). A unit diagonal entry is inserted
  explicitly for every row. Host staging arrays are allocated with cudaMallocHost
  (pinned) and freed once copied into the Thrust device arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);   /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate (pinned) host space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix; row 0 holds only its unit diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* strictly-lower entries of row i, then the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assigning the host staging arrays into the Thrust device
         containers performs the host-to-device copies */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging arrays */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
  Builds the GPU copy of the upper triangular factor U (non-unit diagonal, stored
  inverted so the solve can multiply) from the CPU (ILU-factored) matrix A, and runs
  the cusparse solve analysis on it. Mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix
  but walks a->diag backwards (NOTE(review): relies on the PETSc factored-AIJ layout
  where a->diag[i] locates row i's diagonal of U — confirm).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
      */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate (pinned) host space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: stored as its reciprocal (v[nz] is the diagonal value) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assignment into the Thrust device containers copies host->device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging arrays */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
  Uploads both ILU triangular factors of A to the GPU (building them if the GPU copy is
  stale), allocates the solve work vector, and uploads the row/column permutations when
  the orderings are not the identity (identity permutations are skipped so the solve
  can avoid the permutation step entirely).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation arrays actually uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Builds both GPU triangular factors for an ICC/Cholesky factorization from the CPU
  factor A. Only the upper triangle is stored on the GPU: the "lower" solve reuses the
  upper CSR with CUSPARSE_OPERATION_TRANSPOSE (see the descriptor setup below).

  NOTE(review): A->data is read through BOTH Mat_SeqAIJ (for a->nz) and Mat_SeqSBAIJ
  (for i/j/a) — this relies on the Cholesky factor being stored with the sbaij-compatible
  layout; confirm against the factorization routines.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if
(!n) PetscFunctionReturn(0);   /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate (pinned) host space for the upper triangular matrix; AALo holds the
         values that will back the transpose-based "lower" solve */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements: v[nz] is the stored diagonal, kept as its
           reciprocal in both factors */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated for U; the L values are additionally scaled by 1/diag */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assignment into the Thrust device containers copies host->device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description. The "lower" factor deliberately keeps
         FILL_MODE_UPPER and uses a TRANSPOSE solve below: the same upper CSR
         structure is solved as its transpose, avoiding a separate lower copy. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same structure as the upper factor, different values (AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging arrays */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
  Uploads the ICC/Cholesky triangular factors of A to the GPU, allocates the solve
  work vector, and — when the ordering is not the identity — uploads both the
  permutation and its inverse (rows use ip, columns use ip^{-1}).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonals count twice (stored once but used by both factors), diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  LU/ILU numeric phase: runs the CPU numeric factorization, then installs the
  appropriate GPU solve routines (the NaturalOrdering variants skip the permutation
  step when both row and column orderings are the identity) and mirrors the factors
  to the GPU. Dense matsolve variants are disabled (set to NULL).
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Cholesky/ICC numeric phase: analogous to the LU numeric phase above, but a single
  permutation ip governs both sides of the symmetric factorization.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Builds explicit transposes (CSC via csr2csc) of both triangular factors and runs the
  solve analysis on them, for use by MatSolveTranspose. (Definition continues beyond
  this excerpt.)
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase =
cusparseGetMatIndexBase(loTriFactor->descr); 755 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 756 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 757 diagType = cusparseGetMatDiagType(loTriFactor->descr); 758 759 /* Create the matrix description */ 760 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 761 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 762 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 763 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 764 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 765 766 /* Create the solve analysis information */ 767 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 768 769 /* set the operation */ 770 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 771 772 /* allocate GPU space for the CSC of the lower triangular factor*/ 773 loTriFactorT->csrMat = new CsrMatrix; 774 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 775 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 776 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 777 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 778 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 779 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 780 781 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 782 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 783 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 784 loTriFactor->csrMat->values->data().get(), 785 loTriFactor->csrMat->row_offsets->data().get(), 786 loTriFactor->csrMat->column_indices->data().get(), 787 loTriFactorT->csrMat->values->data().get(), 788 loTriFactorT->csrMat->column_indices->data().get(), 789 loTriFactorT->csrMat->row_offsets->data().get(), 790 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 791 792 /* perform the solve analysis on the transposed matrix */ 793 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 794 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 795 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 796 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 797 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 798 799 /* assign the pointer. Is this really necessary? */ 800 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 801 802 /*********************************************/ 803 /* Now the Transpose of the Upper Tri Factor */ 804 /*********************************************/ 805 806 /* allocate space for the transpose of the upper triangular factor */ 807 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 808 809 /* set the matrix descriptors of the upper triangular factor */ 810 matrixType = cusparseGetMatType(upTriFactor->descr); 811 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 812 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
813 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 814 diagType = cusparseGetMatDiagType(upTriFactor->descr); 815 816 /* Create the matrix description */ 817 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 818 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 819 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 820 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 821 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 822 823 /* Create the solve analysis information */ 824 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 825 826 /* set the operation */ 827 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 828 829 /* allocate GPU space for the CSC of the upper triangular factor*/ 830 upTriFactorT->csrMat = new CsrMatrix; 831 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 832 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 833 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 834 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 835 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 836 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 837 838 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 839 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 840 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 841 upTriFactor->csrMat->values->data().get(), 842 upTriFactor->csrMat->row_offsets->data().get(), 843 upTriFactor->csrMat->column_indices->data().get(), 844 upTriFactorT->csrMat->values->data().get(), 845 upTriFactorT->csrMat->column_indices->data().get(), 846 upTriFactorT->csrMat->row_offsets->data().get(), 847 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 848 849 /* perform the solve analysis on the transposed matrix */ 850 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 851 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 852 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 853 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 854 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 855 856 /* assign the pointer. Is this really necessary? 
*/ 857 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 858 PetscFunctionReturn(0); 859 } 860 861 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 862 { 863 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 864 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 865 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 866 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 867 cusparseStatus_t stat; 868 cusparseIndexBase_t indexBase; 869 cudaError_t err; 870 PetscErrorCode ierr; 871 872 PetscFunctionBegin; 873 874 /* allocate space for the triangular factor information */ 875 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 876 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 877 indexBase = cusparseGetMatIndexBase(matstruct->descr); 878 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 879 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 880 881 /* set alpha and beta */ 882 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 883 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 884 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 885 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 886 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 887 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 888 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 889 890 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 891 CsrMatrix *matrix = 
(CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    /* the transpose has A->cmap->n rows and A->rmap->n columns */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        /* row i is one of the stored (nonempty) compressed rows */
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];   /* empty row: repeat previous offset */
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose of the upper triangular factor, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): the CSR branch above sets the transpose to cmap->n rows by
       rmap->n cols, but here tempT keeps rmap->n rows / cmap->n cols and
       row_offsets of size rmap->n+1; this looks safe only for square matrices
       -- confirm for rectangular A. */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so cprowIndices is just the identity 0..cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Triangular solve with A^T for a permuted factorization: applies the row
   permutation, solves U^T then L^T (via their cached explicit transposes),
   then applies the column permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build and cache the explicit transposes of the factors */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Triangular solve with A^T for a natural-ordering factorization: no
   permutation scatters, just U^T solve followed by L^T solve using the cached
   explicit transposes. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build and cache the explicit transposes of the factors */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve for a permuted factorization: applies the
   row permutation, solves L then U on the GPU, then applies the column
   permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve for a natural-ordering factorization:
   solves L then U on the GPU with no permutation scatters. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copies the host SeqAIJ matrix to the GPU when the device copy is missing or
   stale.  If only the values changed (same nonzero pattern, CSR format) just
   the values array is refreshed; otherwise the whole GPU structure is rebuilt,
   always in compressed-row form, in the storage format selected by the user
   (CSR, ELL, or HYB). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* nonzero pattern changed (or non-CSR format): rebuild from scratch */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx =
a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              /* keep only rows with at least one nonzero */
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident alpha/beta scalars (handle uses device pointer mode) */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage the data as CSR on the device, then convert to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging copy is no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          /* ii/ridx were allocated above only in this branch */
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulates element 0 of the tuple into element 1
   (i.e. dest += src), used to scatter-add compressed-row SpMV results. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A*x, implemented as MatMultAdd with no addend vector. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T*x: performs a NON_TRANSPOSE SpMV with the cached explicit transpose
   of the GPU matrix, generating that transpose on first use. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose lazily on first MatMultTranspose */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* z = y + A*x (or z = A*x when yy is NULL): SpMV in the selected storage
   format, with special handling of compressed-row (zero-row-squeezed)
   storage via the workVector and a scatter-add. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when no rows were squeezed out (workVector spans all rows) */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* SpMV writes into zz directly when uncompressed, else into the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ?
matstruct->beta_one : matstruct->beta_zero; 1467 1468 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1469 /* csr_spmv is multiply add */ 1470 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1471 /* here we need to be careful to set the number of rows in the multiply to the 1472 number of compressed rows in the matrix ... which is equivalent to the 1473 size of the workVector */ 1474 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1475 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1476 mat->num_rows, mat->num_cols, 1477 mat->num_entries, matstruct->alpha, matstruct->descr, 1478 mat->values->data().get(), mat->row_offsets->data().get(), 1479 mat->column_indices->data().get(), xarray, beta, 1480 dptr);CHKERRCUSPARSE(stat); 1481 } else { 1482 if (cusparsestruct->workVector->size()) { 1483 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1484 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1485 matstruct->alpha, matstruct->descr, hybMat, 1486 xarray, beta, 1487 dptr);CHKERRCUSPARSE(stat); 1488 } 1489 } 1490 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1491 1492 if (yy) { 1493 if (dptr != zarray) { 1494 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1495 } else if (zz != yy) { 1496 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1497 } 1498 } else if (dptr != zarray) { 1499 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 1500 } 1501 /* scatter the data from the temporary into the full vector with a += operation */ 1502 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1503 if (dptr != zarray) { 1504 thrust::device_ptr<PetscScalar> zptr; 1505 1506 zptr = thrust::device_pointer_cast(zarray); 1507 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1508 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, 
matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1509 VecCUDAPlusEquals()); 1510 } 1511 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1512 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1513 if (yy && !cmpr) { 1514 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1515 } else { 1516 ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1517 } 1518 } catch(char *ex) { 1519 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1520 } 1521 cerr = WaitForGPU();CHKERRCUDA(cerr); 1522 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1523 PetscFunctionReturn(0); 1524 } 1525 1526 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1527 { 1528 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1529 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1530 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1531 const PetscScalar *xarray; 1532 PetscScalar *zarray,*dptr,*beta; 1533 PetscErrorCode ierr; 1534 cudaError_t cerr; 1535 cusparseStatus_t stat; 1536 1537 PetscFunctionBegin; 1538 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1539 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1540 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1541 if (!matstructT) { 1542 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1543 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1544 } 1545 1546 try { 1547 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1548 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1549 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1550 dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get(); 1551 beta = (yy == zz && dptr == zarray) ? 
matstructT->beta_one : matstructT->beta_zero; 1552 1553 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1554 /* multiply add with matrix transpose */ 1555 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1556 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1557 /* here we need to be careful to set the number of rows in the multiply to the 1558 number of compressed rows in the matrix ... which is equivalent to the 1559 size of the workVector */ 1560 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1561 mat->num_rows, mat->num_cols, 1562 mat->num_entries, matstructT->alpha, matstructT->descr, 1563 mat->values->data().get(), mat->row_offsets->data().get(), 1564 mat->column_indices->data().get(), xarray, beta, 1565 dptr);CHKERRCUSPARSE(stat); 1566 } else { 1567 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1568 if (cusparsestruct->workVector->size()) { 1569 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1570 matstructT->alpha, matstructT->descr, hybMat, 1571 xarray, beta, 1572 dptr);CHKERRCUSPARSE(stat); 1573 } 1574 } 1575 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1576 1577 if (dptr != zarray) { 1578 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1579 } else if (zz != yy) { 1580 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1581 } 1582 /* scatter the data from the temporary into the full vector with a += operation */ 1583 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1584 if (dptr != zarray) { 1585 thrust::device_ptr<PetscScalar> zptr; 1586 1587 zptr = thrust::device_pointer_cast(zarray); 1588 1589 /* scatter the data from the temporary into the full vector with a += operation */ 1590 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1591 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, 
matstructT->cprowIndices->begin()))) + A->cmap->n, 1592 VecCUDAPlusEquals()); 1593 } 1594 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1595 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1596 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1597 } catch(char *ex) { 1598 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1599 } 1600 cerr = WaitForGPU();CHKERRCUDA(cerr); /* is this needed? just for yy==0 in Mult */ 1601 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1602 PetscFunctionReturn(0); 1603 } 1604 1605 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1606 { 1607 PetscErrorCode ierr; 1608 1609 PetscFunctionBegin; 1610 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1611 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1612 if (A->factortype == MAT_FACTOR_NONE) { 1613 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1614 } 1615 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1616 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1617 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1618 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1619 PetscFunctionReturn(0); 1620 } 1621 1622 /* --------------------------------------------------------------------------------*/ 1623 /*@ 1624 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1625 (the default parallel PETSc format). This matrix will ultimately pushed down 1626 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1627 assembly performance the user should preallocate the matrix storage by setting 1628 the parameter nz (or the array nnz). By setting these parameters accurately, 1629 performance during matrix assembly can be increased by more than a factor of 50. 1630 1631 Collective 1632 1633 Input Parameters: 1634 + comm - MPI communicator, set to PETSC_COMM_SELF 1635 . m - number of rows 1636 . n - number of columns 1637 . 
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Releases the GPU-side data (mult structures or triangular factors) before the
   usual SeqAIJ destruction. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicates via the host SeqAIJ path, then attaches a fresh (empty) CUSPARSE
   data structure and the CUSPARSE function table; the GPU copy is rebuilt lazily
   (offloadmask is left UNALLOCATED). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Converts an existing SeqAIJ matrix in place to SEQAIJCUSPARSE: switches the
   default vector type to CUDA, attaches an empty CUSPARSE data structure and
   installs the CUSPARSE function table. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create as SeqAIJ, then convert in place to SEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1832 1833 Level: beginner 1834 1835 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1836 M*/ 1837 1838 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1839 1840 1841 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1842 { 1843 PetscErrorCode ierr; 1844 1845 PetscFunctionBegin; 1846 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1847 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1848 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1849 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1850 PetscFunctionReturn(0); 1851 } 1852 1853 1854 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1855 { 1856 cusparseStatus_t stat; 1857 cusparseHandle_t handle; 1858 1859 PetscFunctionBegin; 1860 if (*cusparsestruct) { 1861 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1862 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1863 delete (*cusparsestruct)->workVector; 1864 if (handle = (*cusparsestruct)->handle) { 1865 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 1866 } 1867 delete *cusparsestruct; 1868 *cusparsestruct = 0; 1869 } 1870 PetscFunctionReturn(0); 1871 } 1872 1873 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1874 { 1875 PetscFunctionBegin; 1876 if (*mat) { 1877 delete (*mat)->values; 1878 delete (*mat)->column_indices; 1879 delete (*mat)->row_offsets; 1880 delete *mat; 1881 *mat = 0; 1882 } 1883 PetscFunctionReturn(0); 1884 } 1885 
/*
   Frees one triangular-factor structure: its cusparse matrix descriptor, its
   solve-analysis info and its CSR data.  Safe to call with *trifactor == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/*
   Frees one mult structure: the GPU matrix (HYB or CSR, depending on format),
   the matrix descriptor, the compressed-row index array, and the device-side
   alpha/beta scalars.  Safe to call with *matstruct == NULL.

   Fix: the return code of CsrMatrix_Destroy() was being discarded; it is now
   checked with CHKERRQ, matching the file's error-handling convention.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are device allocations (freed with cudaFree) */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/*
   Frees all four triangular-factor structures, the permutation index arrays,
   the work vector and the cusparse handle of a factored matrix, then deletes
   and zeroes the struct.  Safe to call with *trifactors == NULL.

   Fixes: the error codes of the four MultStruct destroy calls were being
   ignored, and the handle test used an unparenthesized assignment-in-condition.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}