1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include "petscconf.h" 7 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include "../src/vec/vec/impls/dvecimpl.h" 10 #include "petsc-private/vecimpl.h" 11 #undef VecType 12 #include "cusparsematimpl.h" 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatCUSPARSESetStream" 36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 37 { 38 cusparseStatus_t stat; 39 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 40 41 
PetscFunctionBegin; 42 cusparsestruct->stream = stream; 43 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNCT__ 48 #define __FUNCT__ "MatCUSPARSESetHandle" 49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 50 { 51 cusparseStatus_t stat; 52 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 53 54 PetscFunctionBegin; 55 if (cusparsestruct->handle) 56 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 57 cusparsestruct->handle = handle; 58 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 59 PetscFunctionReturn(0); 60 } 61 62 #undef __FUNCT__ 63 #define __FUNCT__ "MatCUSPARSEClearHandle" 64 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 65 { 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 PetscFunctionBegin; 68 if (cusparsestruct->handle) 69 cusparsestruct->handle = 0; 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 76 { 77 PetscFunctionBegin; 78 *type = MATSOLVERCUSPARSE; 79 PetscFunctionReturn(0); 80 } 81 82 /*MC 83 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 84 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 86 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 87 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 88 algorithms are not recommended. This class does NOT support direct solver operations. 
89 90 Consult CUSPARSE documentation for more information about the matrix storage formats 91 which correspond to the options database keys below. 92 93 Options Database Keys: 94 . -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 95 96 Level: beginner 97 98 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 99 M*/ 100 101 #undef __FUNCT__ 102 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 103 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 104 { 105 PetscErrorCode ierr; 106 PetscInt n = A->rmap->n; 107 108 PetscFunctionBegin; 109 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 110 (*B)->factortype = ftype; 111 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 112 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 113 114 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 115 ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr); 116 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 117 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 118 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 119 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 120 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 121 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 122 123 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 124 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 125 PetscFunctionReturn(0); 126 } 127 128 #undef 
__FUNCT__ 129 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 130 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131 { 132 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 133 134 PetscFunctionBegin; 135 #if CUDA_VERSION>=4020 136 switch (op) { 137 case MAT_CUSPARSE_MULT: 138 cusparsestruct->format = format; 139 break; 140 case MAT_CUSPARSE_SOLVE: 141 cusparseMatSolveStorageFormat = format; 142 break; 143 case MAT_CUSPARSE_ALL: 144 cusparsestruct->format = format; 145 cusparseMatSolveStorageFormat = format; 146 break; 147 default: 148 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 149 } 150 #else 151 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 152 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 153 #endif 154 PetscFunctionReturn(0); 155 } 156 157 /*@ 158 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 159 operation. Only the MatMult operation can use different GPU storage formats 160 for MPIAIJCUSPARSE matrices. 161 Not Collective 162 163 Input Parameters: 164 + A - Matrix of type SEQAIJCUSPARSE 165 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 166 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 167 168 Output Parameter: 169 170 Level: intermediate 171 172 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 173 @*/ 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatCUSPARSESetFormat" 176 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 177 { 178 PetscErrorCode ierr; 179 180 PetscFunctionBegin; 181 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 182 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 183 PetscFunctionReturn(0); 184 } 185 186 #undef __FUNCT__ 187 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 188 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 189 { 190 PetscErrorCode ierr; 191 MatCUSPARSEStorageFormat format; 192 PetscBool flg; 193 194 PetscFunctionBegin; 195 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 196 ierr = PetscObjectOptionsBegin((PetscObject)A); 197 if (A->factortype==MAT_FACTOR_NONE) { 198 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 199 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 200 if (flg) { 201 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 202 } 203 } else { 204 ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve", 205 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 206 if (flg) { 207 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr); 208 } 209 } 210 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 211 
"MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 212 if (flg) { 213 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 214 } 215 ierr = PetscOptionsEnd();CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 218 } 219 220 #undef __FUNCT__ 221 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 222 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 223 { 224 PetscErrorCode ierr; 225 226 PetscFunctionBegin; 227 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 228 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 234 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 235 { 236 PetscErrorCode ierr; 237 238 PetscFunctionBegin; 239 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 240 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 246 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 247 { 248 PetscErrorCode ierr; 249 250 PetscFunctionBegin; 251 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 252 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 253 PetscFunctionReturn(0); 254 } 255 256 #undef __FUNCT__ 257 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 258 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 259 { 260 PetscErrorCode ierr; 261 262 PetscFunctionBegin; 263 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 264 B->ops->choleskyfactornumeric = 
MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 265 PetscFunctionReturn(0); 266 } 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 270 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscInt n = A->rmap->n; 274 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 275 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 276 cusparseStatus_t stat; 277 const PetscInt *ai = a->i,*aj = a->j,*vi; 278 const MatScalar *aa = a->a,*v; 279 PetscInt *AiLo, *AjLo; 280 PetscScalar *AALo; 281 PetscInt i,nz, nzLower, offset, rowOffset; 282 PetscErrorCode ierr; 283 284 PetscFunctionBegin; 285 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 286 try { 287 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 288 nzLower=n+ai[n]-ai[1]; 289 290 /* Allocate Space for the lower triangular matrix */ 291 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 292 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 293 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 294 295 /* Fill the lower triangular matrix */ 296 AiLo[0] = (PetscInt) 0; 297 AiLo[n] = nzLower; 298 AjLo[0] = (PetscInt) 0; 299 AALo[0] = (MatScalar) 1.0; 300 v = aa; 301 vi = aj; 302 offset = 1; 303 rowOffset= 1; 304 for (i=1; i<n; i++) { 305 nz = ai[i+1] - ai[i]; 306 /* additional 1 for the term on the diagonal */ 307 AiLo[i] = rowOffset; 308 rowOffset += nz+1; 309 310 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 311 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 312 313 offset += nz; 314 AjLo[offset] = (PetscInt) i; 315 AALo[offset] = (MatScalar) 1.0; 316 offset += 1; 317 318 v += nz; 319 vi += nz; 320 } 321 322 /* allocate space for the triangular factor information */ 323 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 324 325 /* Create the matrix description */ 326 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 327 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 328 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 329 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 330 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 331 332 /* Create the solve analysis information */ 333 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 334 335 /* set the operation */ 336 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 337 338 /* set the matrix */ 339 loTriFactor->csrMat = new CsrMatrix; 340 loTriFactor->csrMat->num_rows 
= n; 341 loTriFactor->csrMat->num_cols = n; 342 loTriFactor->csrMat->num_entries = nzLower; 343 344 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 345 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 346 347 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 348 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 349 350 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 351 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 352 353 /* perform the solve analysis */ 354 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 355 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 356 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 357 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 358 359 /* assign the pointer. Is this really necessary? */ 360 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 361 362 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 363 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 364 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 365 } catch(char *ex) { 366 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 367 } 368 } 369 PetscFunctionReturn(0); 370 } 371 372 #undef __FUNCT__ 373 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 374 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 375 { 376 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 377 PetscInt n = A->rmap->n; 378 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 379 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 380 cusparseStatus_t stat; 381 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 382 const MatScalar *aa = a->a,*v; 383 PetscInt *AiUp, *AjUp; 384 PetscScalar *AAUp; 385 PetscInt i,nz, nzUpper, offset; 386 PetscErrorCode 
ierr; 387 388 PetscFunctionBegin; 389 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 390 try { 391 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 392 nzUpper = adiag[0]-adiag[n]; 393 394 /* Allocate Space for the upper triangular matrix */ 395 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 396 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 397 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 398 399 /* Fill the upper triangular matrix */ 400 AiUp[0]=(PetscInt) 0; 401 AiUp[n]=nzUpper; 402 offset = nzUpper; 403 for (i=n-1; i>=0; i--) { 404 v = aa + adiag[i+1] + 1; 405 vi = aj + adiag[i+1] + 1; 406 407 /* number of elements NOT on the diagonal */ 408 nz = adiag[i] - adiag[i+1]-1; 409 410 /* decrement the offset */ 411 offset -= (nz+1); 412 413 /* first, set the diagonal elements */ 414 AjUp[offset] = (PetscInt) i; 415 AAUp[offset] = 1./v[nz]; 416 AiUp[i] = AiUp[i+1] - (nz+1); 417 418 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 419 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 420 } 421 422 /* allocate space for the triangular factor information */ 423 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 424 425 /* Create the matrix description */ 426 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 427 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 428 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 429 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 430 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 431 432 /* Create the solve analysis information */ 433 stat = 
cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 434 435 /* set the operation */ 436 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 437 438 /* set the matrix */ 439 upTriFactor->csrMat = new CsrMatrix; 440 upTriFactor->csrMat->num_rows = n; 441 upTriFactor->csrMat->num_cols = n; 442 upTriFactor->csrMat->num_entries = nzUpper; 443 444 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 445 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 446 447 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 448 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 449 450 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 451 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 452 453 /* perform the solve analysis */ 454 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 455 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 456 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 457 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 458 459 /* assign the pointer. Is this really necessary? 
*/ 460 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 461 462 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 463 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 464 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 465 } catch(char *ex) { 466 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 467 } 468 } 469 PetscFunctionReturn(0); 470 } 471 472 #undef __FUNCT__ 473 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 474 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 475 { 476 PetscErrorCode ierr; 477 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 478 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 479 IS isrow = a->row,iscol = a->icol; 480 PetscBool row_identity,col_identity; 481 const PetscInt *r,*c; 482 PetscInt n = A->rmap->n; 483 484 PetscFunctionBegin; 485 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 486 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 487 488 cusparseTriFactors->workVector = new THRUSTARRAY; 489 cusparseTriFactors->workVector->resize(n); 490 cusparseTriFactors->nnz=a->nz; 491 492 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 493 /*lower triangular indices */ 494 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 495 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 496 if (!row_identity) { 497 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 498 cusparseTriFactors->rpermIndices->assign(r, r+n); 499 } 500 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 501 502 /*upper triangular indices */ 503 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 504 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 505 if (!col_identity) { 506 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 507 cusparseTriFactors->cpermIndices->assign(c, c+n); 508 } 509 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 510 PetscFunctionReturn(0); 511 } 512 513 #undef __FUNCT__ 514 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 515 
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 516 { 517 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 518 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 519 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 520 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 521 cusparseStatus_t stat; 522 PetscErrorCode ierr; 523 PetscInt *AiUp, *AjUp; 524 PetscScalar *AAUp; 525 PetscScalar *AALo; 526 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 527 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 528 const PetscInt *ai = b->i,*aj = b->j,*vj; 529 const MatScalar *aa = b->a,*v; 530 531 PetscFunctionBegin; 532 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 533 try { 534 /* Allocate Space for the upper triangular matrix */ 535 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 536 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 537 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 538 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 539 540 /* Fill the upper triangular matrix */ 541 AiUp[0]=(PetscInt) 0; 542 AiUp[n]=nzUpper; 543 offset = 0; 544 for (i=0; i<n; i++) { 545 /* set the pointers */ 546 v = aa + ai[i]; 547 vj = aj + ai[i]; 548 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 549 550 /* first, set the diagonal elements */ 551 AjUp[offset] = (PetscInt) i; 552 AAUp[offset] = 1.0/v[nz]; 553 AiUp[i] = offset; 554 AALo[offset] = 1.0/v[nz]; 555 556 offset+=1; 557 if (nz>0) { 558 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 559 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 560 for (j=offset; j<offset+nz; j++) { 561 AAUp[j] = -AAUp[j]; 562 
AALo[j] = AAUp[j]/v[nz]; 563 } 564 offset+=nz; 565 } 566 } 567 568 /* allocate space for the triangular factor information */ 569 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 570 571 /* Create the matrix description */ 572 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 573 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 574 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 575 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 576 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 577 578 /* Create the solve analysis information */ 579 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 580 581 /* set the operation */ 582 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 583 584 /* set the matrix */ 585 upTriFactor->csrMat = new CsrMatrix; 586 upTriFactor->csrMat->num_rows = A->rmap->n; 587 upTriFactor->csrMat->num_cols = A->cmap->n; 588 upTriFactor->csrMat->num_entries = a->nz; 589 590 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 591 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 592 593 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 594 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 595 596 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 597 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 598 599 /* perform the solve analysis */ 600 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 601 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 602 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 603 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 604 605 /* assign the pointer. 
Is this really necessary? */ 606 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 607 608 /* allocate space for the triangular factor information */ 609 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 610 611 /* Create the matrix description */ 612 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 613 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 614 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 615 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 616 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 617 618 /* Create the solve analysis information */ 619 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 620 621 /* set the operation */ 622 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 623 624 /* set the matrix */ 625 loTriFactor->csrMat = new CsrMatrix; 626 loTriFactor->csrMat->num_rows = A->rmap->n; 627 loTriFactor->csrMat->num_cols = A->cmap->n; 628 loTriFactor->csrMat->num_entries = a->nz; 629 630 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 631 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 632 633 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 634 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 635 636 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 637 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 638 639 /* perform the solve analysis */ 640 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 641 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 642 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 643 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 
644 645 /* assign the pointer. Is this really necessary? */ 646 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 647 648 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 649 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 650 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 651 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 652 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 653 } catch(char *ex) { 654 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 655 } 656 } 657 PetscFunctionReturn(0); 658 } 659 660 #undef __FUNCT__ 661 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 662 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 663 { 664 PetscErrorCode ierr; 665 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 666 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 667 IS ip = a->row; 668 const PetscInt *rip; 669 PetscBool perm_identity; 670 PetscInt n = A->rmap->n; 671 672 PetscFunctionBegin; 673 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 674 cusparseTriFactors->workVector = new THRUSTARRAY; 675 cusparseTriFactors->workVector->resize(n); 676 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 677 678 /*lower triangular indices */ 679 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 680 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 681 if (!perm_identity) { 682 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 683 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 684 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 685 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 686 } 687 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 688 PetscFunctionReturn(0); 689 } 690 691 #undef __FUNCT__ 692 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 693 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 694 { 695 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 696 IS isrow = b->row,iscol = b->col; 697 PetscBool 
                 row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric LU factorization itself is done on the host by the SeqAIJ kernel */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used: the permutation-free
     (NaturalOrdering) solvers apply only when both orderings are the identity. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: factor on the host with the SeqAIJ kernel,
   select the GPU triangular-solve variant based on the ordering, then analyze
   and copy the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b  = (Mat_SeqAIJ*)B->data;
  IS             ip  = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Build explicit GPU transposes of the lower and upper triangular factors
   (via csr2csc) and run the cuSPARSE solve analysis on them, so that
   MatSolveTranspose can be done with non-transposed triangular solves.
   Results are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose
   of A->spptr. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* NOTE(review): the next two initializers are dead stores -- both pointers
     are unconditionally reassigned with 'new' below. */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     transposition flips the fill mode (lower <-> upper) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: the transpose is stored explicitly, so solving with
     it is a non-transposed solve */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     transposition flips the fill mode (upper <-> lower) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Build an explicit GPU copy of A's transpose (CSR via csr2csc, or ELL/HYB
   via a CSR staging copy) for use by MatMultTranspose(Add); stored in
   cusparsestruct->matTranspose and built lazily on first use. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): dead initialization -- matstructT is reassigned with 'new' below */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta: device-side scalars for pointer-mode-device SpMV */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* NOTE(review): these dimensions (and the rmap-sized row_offsets below)
       describe the transpose correctly only when A is square
       (rmap->n == cmap->n); for rectangular A rows/cols should be swapped --
       verify before relying on this path for rectangular matrices. */
    matrixT->num_rows    = A->rmap->n;
    matrixT->num_cols    = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose of the upper triangular factor, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries: the CSR staging copies are no longer needed */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: the transpose is not stored
     in compressed-row form) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solve A^T x = b using the explicitly transposed triangular factors (built
   lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve), applying the row
   permutation to b first and the column permutation to x last. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U (for A^T the factor order is reversed: U^T then L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Same as MatSolveTranspose_SeqAIJCUSPARSE, but without any permutations. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U (factor order is reversed for the transpose system) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solve A x = b with the GPU triangular factors: permute b by the row
   ordering, forward-solve L, back-solve U, then apply the column
   permutation to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Permutation-free forward/backward substitution on the GPU. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U */
  stat =
         cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copy the host AIJ matrix to the GPU in the selected storage format
   (CSR, or ELL/HYB staged through CSR), forcing a compressed-row
   representation; a no-op unless the host copy is newer than the device
   copy (valid_GPU_matrix is UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count the rows that have at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build row offsets (ii) and
           row indices (ridx) over the nonzero rows only */
        int k=0;
        ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-side alpha/beta scalars for pointer-mode-device SpMV calls */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = A->rmap->n;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        /* NOTE(review): row_offsets is sized rmap->n+1 but only
           nonzerorow+1 entries are assigned -- the trailing entries are
           unused when rows are compressed; verify downstream consumers. */
        matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage the matrix through CSR, then convert to ELL/HYB on the device */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = A->rmap->n;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the CSR staging copy is no longer needed */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated here only when the host matrix was not
         already in compressed-row form */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create CUSP vectors compatible with the matrix layout: right uses the
   column map (cmap), left uses the row map (rmap). */
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: elementwise y += x over a zipped (x, y) tuple; used to
   scatter compressed-row SpMV results back into the full vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x using a cuSPARSE SpMV in the selected storage format. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr =
     MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* skip the synchronization when the user has supplied a stream */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T*x: multiply with the explicitly stored transpose (built lazily by
   MatSeqAIJCUSPARSEGenerateTransposeForMult), so no transposed SpMV is
   needed. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x: SpMV into the compressed work vector, then scatter-add the
   result into z through the compressed row indices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               cusparsestruct->workVector->size(), mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               matstruct->alpha, matstruct->descr, hybMat,
                               xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T*x using the lazily built explicit transpose. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the 1565 size of the workVector */ 1566 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1567 cusparsestruct->workVector->size(), mat->num_cols, 1568 mat->num_entries, matstructT->alpha, matstructT->descr, 1569 mat->values->data().get(), mat->row_offsets->data().get(), 1570 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1571 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1572 } else { 1573 #if CUDA_VERSION>=4020 1574 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1575 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1576 matstructT->alpha, matstructT->descr, hybMat, 1577 xarray->data().get(), matstructT->beta, 1578 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1579 #endif 1580 } 1581 1582 /* scatter the data from the temporary into the full vector with a += operation */ 1583 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1584 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1585 VecCUSPPlusEquals()); 1586 1587 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1588 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1589 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1590 1591 } catch(char *ex) { 1592 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1593 } 1594 ierr = WaitForGPU();CHKERRCUSP(ierr); 1595 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 #undef __FUNCT__ 1600 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1601 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1602 { 1603 
PetscErrorCode ierr; 1604 1605 PetscFunctionBegin; 1606 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1607 if (A->factortype==MAT_FACTOR_NONE) { 1608 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1609 } 1610 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1611 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1612 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1613 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1614 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1615 PetscFunctionReturn(0); 1616 } 1617 1618 /* --------------------------------------------------------------------------------*/ 1619 #undef __FUNCT__ 1620 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1621 /*@ 1622 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1623 (the default parallel PETSc format). This matrix will ultimately pushed down 1624 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1625 assembly performance the user should preallocate the matrix storage by setting 1626 the parameter nz (or the array nnz). By setting these parameters accurately, 1627 performance during matrix assembly can be increased by more than a factor of 50. 1628 1629 Collective on MPI_Comm 1630 1631 Input Parameters: 1632 + comm - MPI communicator, set to PETSC_COMM_SELF 1633 . m - number of rows 1634 . n - number of columns 1635 . nz - number of nonzeros per row (same for all rows) 1636 - nnz - array containing the number of nonzeros in the various rows 1637 (possibly different for each row) or NULL 1638 1639 Output Parameter: 1640 . A - the matrix 1641 1642 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1643 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
1644 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1645 1646 Notes: 1647 If nnz is given then nz is ignored 1648 1649 The AIJ format (also called the Yale sparse matrix format or 1650 compressed row storage), is fully compatible with standard Fortran 77 1651 storage. That is, the stored row and column indices can begin at 1652 either one (as in Fortran) or zero. See the users' manual for details. 1653 1654 Specify the preallocated storage with either nz or nnz (not both). 1655 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1656 allocation. For large problems you MUST preallocate memory or you 1657 will get TERRIBLE performance, see the users' manual chapter on matrices. 1658 1659 By default, this format uses inodes (identical nodes) when possible, to 1660 improve numerical efficiency of matrix-vector products and solves. We 1661 search for consecutive rows with the same nonzero structure, thereby 1662 reusing matrix information to achieve increased efficiency. 
1663 1664 Level: intermediate 1665 1666 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1667 @*/ 1668 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1669 { 1670 PetscErrorCode ierr; 1671 1672 PetscFunctionBegin; 1673 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1674 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1675 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1676 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1677 PetscFunctionReturn(0); 1678 } 1679 1680 #undef __FUNCT__ 1681 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1682 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1683 { 1684 PetscErrorCode ierr; 1685 cusparseStatus_t stat; 1686 cudaError_t err; 1687 PetscFunctionBegin; 1688 if (A->factortype==MAT_FACTOR_NONE) { 1689 try { 1690 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1691 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1692 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1693 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1694 1695 /* delete all the members in each of the data structures */ 1696 if (matstruct) { 1697 if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); } 1698 if (matstruct->mat) { 1699 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1700 #if CUDA_VERSION>=4020 1701 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat; 1702 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1703 #endif 1704 } else { 1705 CsrMatrix* mat = (CsrMatrix*)matstruct->mat; 1706 if (mat->values) delete (THRUSTARRAY*)mat->values; 1707 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1708 if 
(mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1709 delete (CsrMatrix*)matstruct->mat; 1710 } 1711 } 1712 if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); } 1713 if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); } 1714 if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices; 1715 if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct; 1716 } 1717 if (matstructT) { 1718 if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); } 1719 if (matstructT->mat) { 1720 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1721 #if CUDA_VERSION>=4020 1722 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat; 1723 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1724 #endif 1725 } else { 1726 CsrMatrix* matT = (CsrMatrix*)matstructT->mat; 1727 if (matT->values) delete (THRUSTARRAY*)matT->values; 1728 if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices; 1729 if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets; 1730 delete (CsrMatrix*)matstructT->mat; 1731 } 1732 } 1733 if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); } 1734 if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); } 1735 if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices; 1736 if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT; 1737 } 1738 if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector; 1739 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1740 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1741 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1742 } else { 1743 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1744 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 
1745 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1746 } 1747 } catch(char *ex) { 1748 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1749 } 1750 } else { 1751 /* The triangular factors */ 1752 try { 1753 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1754 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1755 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1756 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1757 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1758 1759 /* delete all the members in each of the data structures */ 1760 if (loTriFactor) { 1761 if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); } 1762 if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); } 1763 if (loTriFactor->csrMat) { 1764 if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values; 1765 if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices; 1766 if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets; 1767 delete (CsrMatrix*)loTriFactor->csrMat; 1768 } 1769 if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor; 1770 } 1771 1772 if (upTriFactor) { 1773 if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); } 1774 if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); } 1775 if (upTriFactor->csrMat) { 1776 if (upTriFactor->csrMat->values) delete 
(THRUSTARRAY*)upTriFactor->csrMat->values; 1777 if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices; 1778 if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets; 1779 delete (CsrMatrix*)upTriFactor->csrMat; 1780 } 1781 if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor; 1782 } 1783 1784 if (loTriFactorT) { 1785 if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); } 1786 if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); } 1787 if (loTriFactorT->csrMat) { 1788 if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values; 1789 if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices; 1790 if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets; 1791 delete (CsrMatrix*)loTriFactorT->csrMat; 1792 } 1793 if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT; 1794 } 1795 1796 if (upTriFactorT) { 1797 if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); } 1798 if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); } 1799 if (upTriFactorT->csrMat) { 1800 if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values; 1801 if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices; 1802 if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets; 1803 delete (CsrMatrix*)upTriFactorT->csrMat; 1804 } 1805 if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT; 1806 } 1807 if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices; 1808 if 
(cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices; 1809 if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector; 1810 if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); } 1811 if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors; 1812 } catch(char *ex) { 1813 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1814 } 1815 } 1816 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 1817 A->spptr = 0; 1818 1819 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1820 PetscFunctionReturn(0); 1821 } 1822 1823 #undef __FUNCT__ 1824 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1825 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1826 { 1827 PetscErrorCode ierr; 1828 cusparseStatus_t stat; 1829 cusparseHandle_t handle=0; 1830 1831 PetscFunctionBegin; 1832 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1833 if (B->factortype==MAT_FACTOR_NONE) { 1834 /* you cannot check the inode.use flag here since the matrix was just created. 
1835 now build a GPU matrix data structure */ 1836 B->spptr = new Mat_SeqAIJCUSPARSE; 1837 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1838 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1839 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1840 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1841 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1842 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1843 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1844 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1845 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1846 } else { 1847 /* NEXT, set the pointers to the triangular factors */ 1848 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1849 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1850 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1851 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1852 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1853 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1854 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1855 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1856 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; 1857 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1858 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1859 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1860 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1861 } 1862 1863 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1864 default cusparse tri solve. 
Note the difference with the implementation in 1865 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1866 ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 1867 1868 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1869 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1870 B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 1871 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1872 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1873 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1874 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1875 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1876 1877 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1878 1879 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1880 1881 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1882 PetscFunctionReturn(0); 1883 } 1884 1885 /*M 1886 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1887 1888 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1889 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1890 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1891 1892 Options Database Keys: 1893 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1894 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1895 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1896 - -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1897 1898 Level: beginner 1899 1900 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1901 M*/ 1902