/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for the MatCUSPARSEStorageFormat enum, in the layout expected by
   PetscOptionsEnum(): the value names, then the enum type name, then the common
   prefix, then a NULL terminator. Must stay in sync with the enum declaration. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, and mult implementations
   defined later in this file. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode
CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
   MatCUSPARSESetStream - records the given CUDA stream in A's GPU data structure
   and makes the cuSPARSE handle issue its work on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  stream - CUDA stream; the caller retains ownership

   Guards against a missing GPU data structure (A->spptr not yet allocated),
   which would otherwise be a NULL dereference.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - adopts an externally created cuSPARSE handle for A,
   destroying any handle A previously owned, and forces device pointer mode.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  handle - cuSPARSE handle to adopt
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we own before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored in A. Intended for handles owned by someone else. Safe to call on a
   matrix with no GPU data structure.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Reports the solver package name ("cusparse") for factored matrices created here. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   Factory routine for MATSOLVERCUSPARSE factored matrices: creates the factor
   matrix B (same size as A, type SEQAIJCUSPARSE) and installs the symbolic
   factorization routines appropriate for the requested factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factorization routines allocate the storage themselves */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE: both supported
   operations store the format in the single format field of the GPU struct. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:          /* fall through: both ops set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing: -mat_cusparse_mult_storage_format (unfactored matrices only)
   and -mat_cusparse_storage_format select the GPU storage format. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: CPU AIJ symbolic phase, then hook the CUSPARSE numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU AIJ symbolic phase, then hook the CUSPARSE numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU AIJ symbolic phase, then hook the CUSPARSE numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Assembles the lower-triangular ILU factor (unit diagonal added explicitly)
   in pinned host memory, uploads it to the GPU via thrust arrays, and runs the
   cuSPARSE triangular-solve analysis on it. A holds the CPU factor data. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the GPU copy is absent or stale relative to the CPU data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask ==
PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host buffers (cudaMallocHost) so the thrust upload can use fast DMA */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower part of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() performs the host-to-device copy */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Assembles the upper-triangular ILU factor (diagonal stored as its reciprocal)
   in pinned host memory, uploads it to the GPU, and runs the cuSPARSE
   triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      /* adiag[] stores diagonal positions in reverse CSR order, so this difference is the count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: the diagonal is stored inverted */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() performs the host-to-device copy */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU and uploads the row/column
   permutation indices (only when the orderings are not the identity). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only if non-identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: upload the column permutation only if non-identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation data actually transferred */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds both ICC/Cholesky triangular factors on the GPU from the upper factor
   stored on the CPU. Only the upper factor is stored; the lower solve uses it
   with a TRANSPOSE operation (see solveOp below). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed through both Mat_SeqAIJ and Mat_SeqSBAIJ here;
     presumably the ICC factor layout makes these compatible -- verify if touched */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if
(!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix; AAUp/AALo get the scalings for the
         up and lo solves respectively (same sparsity pattern, different values) */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() performs the host-to-device copy */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: note the "lower" factor also uses UPPER
         fill -- it is the same upper pattern solved with a TRANSPOSE op below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the ICC triangular factors on the GPU and uploads the permutation and
   its inverse when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonal entries count twice (stored once, used for both triangles) */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* rperm gets the permutation, cperm its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU: run the CPU factorization, pick the solve variants (natural
   ordering is the fast path), then copy the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* no permutations: the NaturalOrdering solves skip the vector reordering */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: run the CPU factorization, pick the solve variants, then
   copy the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    /* identity permutation: fast path without vector reordering */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds CSC (transpose) copies of both triangular factors and runs the solve
   analysis on them, for use by MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase =
cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposition flips the fill mode: lower becomes upper and vice versa */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so solves on it are NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  /* transposition flips the fill mode, as above */
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds and caches an explicit transpose
   of the multiply structure (CSR, or HYB/ELL to match the configured storage format)
   so MatMultTranspose can run a NON_TRANSPOSE SpMV on it.  Called lazily the first
   time a transpose multiply is requested.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information
     (NOTE(review): overwrites the matstructT value loaded above; callers only reach
     here when matTranspose is NULL) */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars (pointer mode DEVICE is set below) */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix =
(CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose swaps the row/column dimensions */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* cache the full (uncompressed) host row offsets on the GPU; csr2csc needs
       uncompressed CSR input */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries: only the HYB copy is kept */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is not row-compressed, so all
     A->cmap->n rows are listed as 0..n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b using the explicitly stored
   transposed factors (built lazily): row-permute b, solve with U^T, solve with L^T,
   then apply the column permutation.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (factors are transposed, so U^T comes before L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - same as the routine above but
   for identity orderings, so the permutation scatters are skipped entirely.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (transposed factors, so U^T before L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE - triangular solve with permutations: scatter b with the
   row ordering, solve L then U on the GPU, then scatter the result with the column
   ordering.  Uses the factors resident in A->spptr.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation (the iteration length is fixed by the
     rpermIndices begin/end index iterators, not by the base pointer) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - as MatSolve_SeqAIJCUSPARSE but for
   identity orderings: the permutation scatters are skipped.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyToGPU - mirrors the host AIJ data onto the GPU.  No-op when
   the matrix is bound to the CPU or already up to date.  When only the numerical
   values changed (same nonzero state, CSR format) just the values are re-copied
   (and a cached transpose is refreshed); otherwise the GPU structures are destroyed
   and rebuilt from scratch, always in compressed-row form.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* nonzero pattern is new or changed (or non-CSR format): rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets/indices over the
             nonzero rows only */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident alpha/beta scalars (pointer mode DEVICE is set below) */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a temporary CSR copy on the GPU, convert it to HYB, then discard it */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          /* ii/ridx were allocated above in the "forcing compressed row" branch */
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* functor for thrust::for_each: second tuple element += first; used to scatter
   compressed-row SpMV results back into the full output vector */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* MatMult_SeqAIJCUSPARSE - yy = A*xx, implemented as MatMultAdd with a NULL add vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatMultTranspose_SeqAIJCUSPARSE - yy = A^T*xx using the cached explicit transpose,
   building it on first use; runs a NON_TRANSPOSE SpMV on the transposed structure.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose lazily on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt n = yy->map->n;
    cudaError_t err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
   MatMultAdd_SeqAIJCUSPARSE - zz = A*xx + yy (yy may be NULL for a plain multiply).
   With compressed-row storage the SpMV writes into a work vector which is then
   scattered into zz.  (Definition continues beyond this chunk.)
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* "compressed" here means the work vector covers every row, i.e. no row compression */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* SpMV output goes straight to zz when sizes match, otherwise to the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr,
matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1512 VecCUDAPlusEquals()); 1513 } 1514 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1515 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1516 if (yy && !cmpr) { 1517 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1518 } else { 1519 ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1520 } 1521 } catch(char *ex) { 1522 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1523 } 1524 cerr = WaitForGPU();CHKERRCUDA(cerr); 1525 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1526 PetscFunctionReturn(0); 1527 } 1528 1529 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1530 { 1531 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1532 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1533 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1534 const PetscScalar *xarray; 1535 PetscScalar *zarray,*beta; 1536 PetscErrorCode ierr; 1537 cudaError_t cerr; 1538 cusparseStatus_t stat; 1539 1540 PetscFunctionBegin; 1541 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1542 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1543 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1544 if (!matstructT) { 1545 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1546 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1547 } 1548 1549 /* Note unlike Mat, MatTranspose uses non-compressed row storage */ 1550 try { 1551 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1552 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1553 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1554 beta = (yy == zz) ? 
matstructT->beta_one : matstructT->beta_zero; 1555 1556 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1557 /* multiply add with matrix transpose */ 1558 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1559 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1560 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1561 mat->num_rows, mat->num_cols, 1562 mat->num_entries, matstructT->alpha, matstructT->descr, 1563 mat->values->data().get(), mat->row_offsets->data().get(), 1564 mat->column_indices->data().get(), xarray, beta, 1565 zarray);CHKERRCUSPARSE(stat); 1566 } else { 1567 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1568 if (cusparsestruct->workVector->size()) { 1569 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1570 matstructT->alpha, matstructT->descr, hybMat, 1571 xarray, beta, 1572 zarray);CHKERRCUSPARSE(stat); 1573 } 1574 } 1575 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1576 1577 if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);} 1578 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1579 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1580 } catch(char *ex) { 1581 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1582 } 1583 cerr = WaitForGPU();CHKERRCUDA(cerr); 1584 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1585 PetscFunctionReturn(0); 1586 } 1587 1588 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1589 { 1590 PetscErrorCode ierr; 1591 1592 PetscFunctionBegin; 1593 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1594 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 1595 if (A->factortype == MAT_FACTOR_NONE) { 1596 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1597 } 1598 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1599 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1600 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1601 A->ops->multtransposeadd = 
MatMultTransposeAdd_SeqAIJCUSPARSE; 1602 PetscFunctionReturn(0); 1603 } 1604 1605 /* --------------------------------------------------------------------------------*/ 1606 /*@ 1607 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1608 (the default parallel PETSc format). This matrix will ultimately pushed down 1609 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1610 assembly performance the user should preallocate the matrix storage by setting 1611 the parameter nz (or the array nnz). By setting these parameters accurately, 1612 performance during matrix assembly can be increased by more than a factor of 50. 1613 1614 Collective 1615 1616 Input Parameters: 1617 + comm - MPI communicator, set to PETSC_COMM_SELF 1618 . m - number of rows 1619 . n - number of columns 1620 . nz - number of nonzeros per row (same for all rows) 1621 - nnz - array containing the number of nonzeros in the various rows 1622 (possibly different for each row) or NULL 1623 1624 Output Parameter: 1625 . A - the matrix 1626 1627 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1628 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1629 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1630 1631 Notes: 1632 If nnz is given then nz is ignored 1633 1634 The AIJ format (also called the Yale sparse matrix format or 1635 compressed row storage), is fully compatible with standard Fortran 77 1636 storage. That is, the stored row and column indices can begin at 1637 either one (as in Fortran) or zero. See the users' manual for details. 1638 1639 Specify the preallocated storage with either nz or nnz (not both). 1640 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1641 allocation. For large problems you MUST preallocate memory or you 1642 will get TERRIBLE performance, see the users' manual chapter on matrices. 
1643 1644 By default, this format uses inodes (identical nodes) when possible, to 1645 improve numerical efficiency of matrix-vector products and solves. We 1646 search for consecutive rows with the same nonzero structure, thereby 1647 reusing matrix information to achieve increased efficiency. 1648 1649 Level: intermediate 1650 1651 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1652 @*/ 1653 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1654 { 1655 PetscErrorCode ierr; 1656 1657 PetscFunctionBegin; 1658 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1659 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1660 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1661 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1662 PetscFunctionReturn(0); 1663 } 1664 1665 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1666 { 1667 PetscErrorCode ierr; 1668 1669 PetscFunctionBegin; 1670 if (A->factortype==MAT_FACTOR_NONE) { 1671 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { 1672 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1673 } 1674 } else { 1675 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1676 } 1677 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1678 PetscFunctionReturn(0); 1679 } 1680 1681 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 1682 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1683 { 1684 PetscErrorCode ierr; 1685 Mat C; 1686 cusparseStatus_t stat; 1687 cusparseHandle_t handle=0; 1688 1689 PetscFunctionBegin; 1690 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1691 C = *B; 1692 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1693 ierr = 
PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1694 1695 /* inject CUSPARSE-specific stuff */ 1696 if (C->factortype==MAT_FACTOR_NONE) { 1697 /* you cannot check the inode.use flag here since the matrix was just created. 1698 now build a GPU matrix data structure */ 1699 C->spptr = new Mat_SeqAIJCUSPARSE; 1700 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1701 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1702 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1703 ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0; 1704 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1705 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1706 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1707 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1708 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1709 } else { 1710 /* NEXT, set the pointers to the triangular factors */ 1711 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1712 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1713 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1714 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1715 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1716 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1717 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1718 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1719 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1720 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1721 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1723 } 1724 1725 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1726 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1727 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1728 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1729 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1730 C->ops->multtranspose = 
MatMultTranspose_SeqAIJCUSPARSE; 1731 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1732 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1733 C->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 1734 1735 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1736 1737 C->boundtocpu = PETSC_FALSE; 1738 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 1739 1740 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1741 PetscFunctionReturn(0); 1742 } 1743 1744 1745 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 1746 { 1747 PetscFunctionBegin; 1748 /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU. 1749 If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one. 1750 Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case. 1751 TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries; 1752 can follow the example of MatBindToCPU_SeqAIJViennaCL(). 
*/ 1753 if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); 1754 if (flg) { 1755 A->ops->mult = MatMult_SeqAIJ; 1756 A->ops->multadd = MatMultAdd_SeqAIJ; 1757 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 1758 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 1759 A->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 1760 A->ops->duplicate = MatDuplicate_SeqAIJ; 1761 } else { 1762 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1763 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1764 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1765 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1766 A->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1767 A->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1768 A->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1769 } 1770 A->boundtocpu = flg; 1771 PetscFunctionReturn(0); 1772 } 1773 1774 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1775 { 1776 PetscErrorCode ierr; 1777 cusparseStatus_t stat; 1778 cusparseHandle_t handle=0; 1779 1780 PetscFunctionBegin; 1781 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1782 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1783 1784 if (B->factortype==MAT_FACTOR_NONE) { 1785 /* you cannot check the inode.use flag here since the matrix was just created. 
1786 now build a GPU matrix data structure */ 1787 B->spptr = new Mat_SeqAIJCUSPARSE; 1788 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1789 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1790 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1791 ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0; 1792 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1793 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1794 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1795 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1796 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1797 } else { 1798 /* NEXT, set the pointers to the triangular factors */ 1799 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1800 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1801 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1802 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1803 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1804 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1805 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1806 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1807 stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat); 1808 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1809 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1810 } 1811 1812 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1813 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1814 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1815 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1816 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1817 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1818 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1819 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1820 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 1821 1822 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1823 1824 
B->boundtocpu = PETSC_FALSE; 1825 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 1826 1827 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1828 PetscFunctionReturn(0); 1829 } 1830 1831 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1832 { 1833 PetscErrorCode ierr; 1834 1835 PetscFunctionBegin; 1836 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1837 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1838 PetscFunctionReturn(0); 1839 } 1840 1841 /*MC 1842 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1843 1844 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1845 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1846 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1847 1848 Options Database Keys: 1849 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1850 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1851 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1852 1853 Level: beginner 1854 1855 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1856 M*/ 1857 1858 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1859 1860 1861 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1862 { 1863 PetscErrorCode ierr; 1864 1865 PetscFunctionBegin; 1866 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1867 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1868 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1869 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1870 PetscFunctionReturn(0); 1871 } 1872 1873 1874 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1875 { 1876 cusparseStatus_t stat; 1877 cusparseHandle_t handle; 1878 1879 PetscFunctionBegin; 1880 if (*cusparsestruct) { 1881 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1882 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1883 delete (*cusparsestruct)->workVector; 1884 delete (*cusparsestruct)->rowoffsets_gpu; 1885 if (handle = (*cusparsestruct)->handle) { 1886 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 1887 } 1888 delete *cusparsestruct; 1889 *cusparsestruct = 0; 1890 } 1891 PetscFunctionReturn(0); 1892 } 1893 1894 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1895 { 1896 PetscFunctionBegin; 1897 if (*mat) { 1898 delete (*mat)->values; 1899 delete (*mat)->column_indices; 1900 delete (*mat)->row_offsets; 1901 delete *mat; 1902 *mat = 0; 
1903 } 1904 PetscFunctionReturn(0); 1905 } 1906 1907 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1908 { 1909 cusparseStatus_t stat; 1910 PetscErrorCode ierr; 1911 1912 PetscFunctionBegin; 1913 if (*trifactor) { 1914 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 1915 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 1916 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1917 delete *trifactor; 1918 *trifactor = 0; 1919 } 1920 PetscFunctionReturn(0); 1921 } 1922 1923 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1924 { 1925 CsrMatrix *mat; 1926 cusparseStatus_t stat; 1927 cudaError_t err; 1928 1929 PetscFunctionBegin; 1930 if (*matstruct) { 1931 if ((*matstruct)->mat) { 1932 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1933 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1934 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 1935 } else { 1936 mat = (CsrMatrix*)(*matstruct)->mat; 1937 CsrMatrix_Destroy(&mat); 1938 } 1939 } 1940 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 1941 delete (*matstruct)->cprowIndices; 1942 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1943 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 1944 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 1945 delete *matstruct; 1946 *matstruct = 0; 1947 } 1948 PetscFunctionReturn(0); 1949 } 1950 1951 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1952 { 1953 cusparseHandle_t handle; 1954 cusparseStatus_t stat; 1955 1956 PetscFunctionBegin; 1957 if (*trifactors) { 
1958 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1959 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1960 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1961 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1962 delete (*trifactors)->rpermIndices; 1963 delete (*trifactors)->cpermIndices; 1964 delete (*trifactors)->workVector; 1965 if (handle = (*trifactors)->handle) { 1966 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 1967 } 1968 delete *trifactors; 1969 *trifactors = 0; 1970 } 1971 PetscFunctionReturn(0); 1972 } 1973 1974