1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include <petscconf.h> 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include <../src/vec/vec/impls/dvecimpl.h> 10 #include <petsc-private/vecimpl.h> 11 #undef VecType 12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatCUSPARSESetStream" 36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 37 { 38 cusparseStatus_t stat; 39 Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 40 41 PetscFunctionBegin; 42 cusparsestruct->stream = stream; 43 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNCT__ 48 #define __FUNCT__ "MatCUSPARSESetHandle" 49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 50 { 51 cusparseStatus_t stat; 52 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 53 54 PetscFunctionBegin; 55 if (cusparsestruct->handle) 56 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 57 cusparsestruct->handle = handle; 58 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 59 PetscFunctionReturn(0); 60 } 61 62 #undef __FUNCT__ 63 #define __FUNCT__ "MatCUSPARSEClearHandle" 64 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 65 { 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 PetscFunctionBegin; 68 if (cusparsestruct->handle) 69 cusparsestruct->handle = 0; 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 76 { 77 PetscFunctionBegin; 78 *type = MATSOLVERCUSPARSE; 79 PetscFunctionReturn(0); 80 } 81 82 /*MC 83 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 84 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 86 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 87 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 88 algorithms are not recommended. This class does NOT support direct solver operations. 
89 90 Level: beginner 91 92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 93 M*/ 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 #undef __FUNCT__ 123 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 124 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 125 { 126 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 127 128 PetscFunctionBegin; 129 #if CUDA_VERSION>=4020 130 switch (op) { 131 case MAT_CUSPARSE_MULT: 132 
cusparsestruct->format = format; 133 break; 134 case MAT_CUSPARSE_ALL: 135 cusparsestruct->format = format; 136 break; 137 default: 138 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 139 } 140 #else 141 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 142 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 143 #endif 144 PetscFunctionReturn(0); 145 } 146 147 /*@ 148 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 149 operation. Only the MatMult operation can use different GPU storage formats 150 for MPIAIJCUSPARSE matrices. 151 Not Collective 152 153 Input Parameters: 154 + A - Matrix of type SEQAIJCUSPARSE 155 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 156 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 157 158 Output Parameter: 159 160 Level: intermediate 161 162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 163 @*/ 164 #undef __FUNCT__ 165 #define __FUNCT__ "MatCUSPARSESetFormat" 166 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 167 { 168 PetscErrorCode ierr; 169 170 PetscFunctionBegin; 171 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 172 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 178 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 MatCUSPARSEStorageFormat format; 182 PetscBool flg; 183 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 184 185 PetscFunctionBegin; 186 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 187 ierr = PetscObjectOptionsBegin((PetscObject)A); 188 if (A->factortype==MAT_FACTOR_NONE) { 189 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 190 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 191 if (flg) { 192 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 193 } 194 } 195 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 196 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 197 if (flg) { 198 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 199 } 200 ierr = PetscOptionsEnd();CHKERRQ(ierr); 201 PetscFunctionReturn(0); 202 203 } 204 205 #undef __FUNCT__ 206 #define __FUNCT__ 
"MatILUFactorSymbolic_SeqAIJCUSPARSE" 207 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 208 { 209 PetscErrorCode ierr; 210 211 PetscFunctionBegin; 212 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 213 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 214 PetscFunctionReturn(0); 215 } 216 217 #undef __FUNCT__ 218 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 219 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 220 { 221 PetscErrorCode ierr; 222 223 PetscFunctionBegin; 224 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 225 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 226 PetscFunctionReturn(0); 227 } 228 229 #undef __FUNCT__ 230 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 231 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 232 { 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 237 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 238 PetscFunctionReturn(0); 239 } 240 241 #undef __FUNCT__ 242 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 243 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 244 { 245 PetscErrorCode ierr; 246 247 PetscFunctionBegin; 248 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 249 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 250 PetscFunctionReturn(0); 251 } 252 253 #undef __FUNCT__ 254 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 255 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 256 { 257 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 258 PetscInt n = A->rmap->n; 259 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors 
= (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 260 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 261 cusparseStatus_t stat; 262 const PetscInt *ai = a->i,*aj = a->j,*vi; 263 const MatScalar *aa = a->a,*v; 264 PetscInt *AiLo, *AjLo; 265 PetscScalar *AALo; 266 PetscInt i,nz, nzLower, offset, rowOffset; 267 PetscErrorCode ierr; 268 269 PetscFunctionBegin; 270 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 271 try { 272 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 273 nzLower=n+ai[n]-ai[1]; 274 275 /* Allocate Space for the lower triangular matrix */ 276 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 277 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 278 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 279 280 /* Fill the lower triangular matrix */ 281 AiLo[0] = (PetscInt) 0; 282 AiLo[n] = nzLower; 283 AjLo[0] = (PetscInt) 0; 284 AALo[0] = (MatScalar) 1.0; 285 v = aa; 286 vi = aj; 287 offset = 1; 288 rowOffset= 1; 289 for (i=1; i<n; i++) { 290 nz = ai[i+1] - ai[i]; 291 /* additional 1 for the term on the diagonal */ 292 AiLo[i] = rowOffset; 293 rowOffset += nz+1; 294 295 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 296 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 297 298 offset += nz; 299 AjLo[offset] = (PetscInt) i; 300 AALo[offset] = (MatScalar) 1.0; 301 offset += 1; 302 303 v += nz; 304 vi += nz; 305 } 306 307 /* allocate space for the triangular factor information */ 308 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 309 310 /* Create the matrix description */ 311 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 312 stat = cusparseSetMatIndexBase(loTriFactor->descr, 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 313 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 314 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 315 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 316 317 /* Create the solve analysis information */ 318 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 319 320 /* set the operation */ 321 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 322 323 /* set the matrix */ 324 loTriFactor->csrMat = new CsrMatrix; 325 loTriFactor->csrMat->num_rows = n; 326 loTriFactor->csrMat->num_cols = n; 327 loTriFactor->csrMat->num_entries = nzLower; 328 329 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 330 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 331 332 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 333 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 334 335 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 336 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 337 338 /* perform the solve analysis */ 339 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 340 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 341 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 342 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 343 344 /* assign the pointer. Is this really necessary? 
*/ 345 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 346 347 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 348 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 349 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 350 } catch(char *ex) { 351 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 352 } 353 } 354 PetscFunctionReturn(0); 355 } 356 357 #undef __FUNCT__ 358 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 359 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 360 { 361 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 362 PetscInt n = A->rmap->n; 363 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 364 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 365 cusparseStatus_t stat; 366 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 367 const MatScalar *aa = a->a,*v; 368 PetscInt *AiUp, *AjUp; 369 PetscScalar *AAUp; 370 PetscInt i,nz, nzUpper, offset; 371 PetscErrorCode ierr; 372 373 PetscFunctionBegin; 374 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 375 try { 376 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/
      /* adiag runs backwards (adiag[0] > adiag[n] in this layout), so the difference is the U entry count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix; pinned host memory for fast transfer */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: stored inverted (1/d) in the CPU factor layout */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Build both ILU triangular factors on the GPU, allocate the shared work
   vector, and cache the row/column permutations (only when non-identity) as
   device index arrays for use by the permuted MatSolve paths. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Build the ICC factors on the GPU from the CPU Cholesky/ICC factor: the upper
   factor U is assembled directly; the "lower" solve is expressed as U^T via a
   second CUSPARSE structure that reuses U's sparsity pattern with scaled values
   and CUSPARSE_OPERATION_TRANSPOSE. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ here;
     presumably the ICC factor uses the SBAIJ-compatible i/j/a layout — confirm. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U; the U^T copy is additionally scaled by the inverse diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description. Note the UPPER fill mode: the lower solve
         is performed as the TRANSPOSE of this upper-pattern matrix (AALo values). */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation: transpose solve of the upper-pattern matrix */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern (AiUp/AjUp) as U, but the AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Build the ICC factors on the GPU, allocate the work vector, record the
   factor's nonzero count ((a->nz-n)*2+n counts both triangles once plus the
   diagonal), and cache the permutation (same IS for rows and columns). */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric permutation: the same index set is used for rows and columns */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU: factor on the CPU, pick the natural-ordering solve kernels when
   both permutations are identity, then copy/analyze the factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC: factor on the CPU, select solve kernels by whether the
   (symmetric) permutation is identity, then copy/analyze the factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Build explicit transposes (CSC) of both triangular factors via csr2csc and
   run solve analysis on them, for use by MatSolveTranspose. (Definition
   continues beyond this chunk.) */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 758 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 759 diagType = cusparseGetMatDiagType(loTriFactor->descr); 760 761 /* Create the matrix description */ 762 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 763 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 764 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 765 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 766 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 767 768 /* Create the solve analysis information */ 769 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 770 771 /* set the operation */ 772 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 773 774 /* allocate GPU space for the CSC of the lower triangular factor*/ 775 loTriFactorT->csrMat = new CsrMatrix; 776 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 777 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 778 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 779 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 780 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 781 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 782 783 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 784 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 785 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 786 loTriFactor->csrMat->values->data().get(), 787 loTriFactor->csrMat->row_offsets->data().get(), 788 loTriFactor->csrMat->column_indices->data().get(), 789 loTriFactorT->csrMat->values->data().get(), 790 loTriFactorT->csrMat->column_indices->data().get(), 791 loTriFactorT->csrMat->row_offsets->data().get(), 792 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 793 794 /* perform the solve analysis on the transposed matrix */ 795 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 796 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 797 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 798 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 799 loTriFactorT->solveInfo);CHKERRCUSP(stat); 800 801 /* assign the pointer. Is this really necessary? */ 802 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 803 804 /*********************************************/ 805 /* Now the Transpose of the Upper Tri Factor */ 806 /*********************************************/ 807 808 /* allocate space for the transpose of the upper triangular factor */ 809 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 810 811 /* set the matrix descriptors of the upper triangular factor */ 812 matrixType = cusparseGetMatType(upTriFactor->descr); 813 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 814 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
815 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 816 diagType = cusparseGetMatDiagType(upTriFactor->descr); 817 818 /* Create the matrix description */ 819 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 820 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 821 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 822 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 823 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 824 825 /* Create the solve analysis information */ 826 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 827 828 /* set the operation */ 829 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 830 831 /* allocate GPU space for the CSC of the upper triangular factor*/ 832 upTriFactorT->csrMat = new CsrMatrix; 833 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 834 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 835 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 836 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 837 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 838 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 839 840 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 841 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 842 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 843 upTriFactor->csrMat->values->data().get(), 844 upTriFactor->csrMat->row_offsets->data().get(), 845 upTriFactor->csrMat->column_indices->data().get(), 846 upTriFactorT->csrMat->values->data().get(), 847 upTriFactorT->csrMat->column_indices->data().get(), 848 upTriFactorT->csrMat->row_offsets->data().get(), 849 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 850 851 /* perform the solve analysis on the transposed matrix */ 852 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 853 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 854 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 855 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 856 upTriFactorT->solveInfo);CHKERRCUSP(stat); 857 858 /* assign the pointer. Is this really necessary? 
*/ 859 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 860 PetscFunctionReturn(0); 861 } 862 863 #undef __FUNCT__ 864 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 865 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 866 { 867 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 868 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 869 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 870 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 871 cusparseStatus_t stat; 872 cusparseIndexBase_t indexBase; 873 cudaError_t err; 874 875 PetscFunctionBegin; 876 877 /* allocate space for the triangular factor information */ 878 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 879 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 880 indexBase = cusparseGetMatIndexBase(matstruct->descr); 881 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 882 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 883 884 /* set alpha and beta */ 885 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 886 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 887 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 888 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 889 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 890 891 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 892 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 893 CsrMatrix *matrixT= new CsrMatrix; 894 matrixT->num_rows = A->rmap->n; 895 matrixT->num_cols = A->cmap->n; 896 matrixT->num_entries = a->nz; 897 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose: CSR of A converted to CSC, which read as CSR is A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* HYB has no direct transpose; round-trip HYB -> CSR -> CSC -> HYB.
       First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries: only the HYB copy is kept */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (left empty here; the transpose path does
     not populate them -- see MatSeqAIJCUSPARSECopyToGPU for the forward path) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b with row/column permutations, using the explicitly stored
   transposed triangular factors (built lazily on first call).  Order of solves
   is U^T then L^T, the reverse of the untransposed solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U (the stored transpose, so a NON_TRANSPOSE solve) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Same as MatSolveTranspose_SeqAIJCUSPARSE but with identity orderings, so the
   permutation copies are skipped: b feeds the U^T solve directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b with row/column permutations: permute b, solve L then U on the
   GPU, then apply the column permutation to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           t;
  PetscBool                         flg;

  PetscFunctionBegin;
  /* sanity checks: both vectors must be GPU (VECSEQCUSP) vectors */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUSP);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUSP);

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L (forward substitution into the work vector) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U (back substitution into xGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b for identity orderings: no permutation copies, b feeds the L
   solve directly. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copies the host CSR matrix to the GPU (CSR or HYB/ELL storage, per the chosen
   format), always using a compressed-row layout: rows with no nonzeros are
   squeezed out and their original indices kept in cprowIndices.  No-op unless
   the host copy is newer than the GPU copy. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count rows that have at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use)
      {
        /* host already stores compressed-row metadata; reuse it */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build ii/ridx over nonzero rows only */
        int k=0;
        ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the SpMV structure */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-side alpha/beta scalars (pointer mode DEVICE is set below) */
      err  = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err  = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err  = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err  = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage the CSR on the GPU, convert to HYB, then discard the CSR copy */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
        /* NOTE(review): for CUDA_VERSION<4020 this branch silently leaves
           matstruct->mat unset -- confirm an error would be preferable here */
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated above only in the non-compressed-row case */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the number of compressed rows; used by MatMultAdd */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/* Creates left/right work vectors of type VECSEQCUSP compatible with this
   matrix's layouts; either output pointer may be NULL. */
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    /* right vector lives in the column space */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector lives in the row space */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: element-wise y += x over a zipped (x, y) tuple; used to
   scatter-add the compressed-row SpMV result back into the full vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A x on the GPU via cusparse spmv (CSR or HYB storage). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* if a user-supplied stream is active, the caller owns synchronization */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T x using the cached explicit transpose (built lazily on first call). */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY        *xarray,*yarray;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the explicit transpose lazily the first time it is needed */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  /* NON_TRANSPOSE spmv with the stored transpose computes A^T x */
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A x: copies y into z, runs the compressed-row spmv into the work
   vector, then scatter-adds the work vector into z via cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the spmv entirely when there are no nonzero rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T x using the cached explicit transpose (built lazily). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat
= (CsrMatrix*)matstructT->mat; 1561 /* here we need to be careful to set the number of rows in the multiply to the 1562 number of compressed rows in the matrix ... which is equivalent to the 1563 size of the workVector */ 1564 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1565 mat->num_rows, mat->num_cols, 1566 mat->num_entries, matstructT->alpha, matstructT->descr, 1567 mat->values->data().get(), mat->row_offsets->data().get(), 1568 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1569 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1570 } else { 1571 #if CUDA_VERSION>=4020 1572 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1573 if (cusparsestruct->workVector->size()) { 1574 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1575 matstructT->alpha, matstructT->descr, hybMat, 1576 xarray->data().get(), matstructT->beta, 1577 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1578 } 1579 #endif 1580 } 1581 1582 /* scatter the data from the temporary into the full vector with a += operation */ 1583 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1584 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1585 VecCUSPPlusEquals()); 1586 1587 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1588 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1589 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1590 1591 } catch(char *ex) { 1592 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1593 } 1594 ierr = WaitForGPU();CHKERRCUSP(ierr); 1595 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1596 PetscFunctionReturn(0); 
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finishes host-side assembly and, for non-factored matrices, pushes the new
   values to the GPU; (re)installs the CUSPARSE mult operations on final assembly. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* factored matrices keep their data in the triangular-factor structures instead */
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* flush assemblies may be followed by more insertions; do not switch ops yet */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1644 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1645 1646 Notes: 1647 If nnz is given then nz is ignored 1648 1649 The AIJ format (also called the Yale sparse matrix format or 1650 compressed row storage), is fully compatible with standard Fortran 77 1651 storage. That is, the stored row and column indices can begin at 1652 either one (as in Fortran) or zero. See the users' manual for details. 1653 1654 Specify the preallocated storage with either nz or nnz (not both). 1655 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1656 allocation. For large problems you MUST preallocate memory or you 1657 will get TERRIBLE performance, see the users' manual chapter on matrices. 1658 1659 By default, this format uses inodes (identical nodes) when possible, to 1660 improve numerical efficiency of matrix-vector products and solves. We 1661 search for consecutive rows with the same nonzero structure, thereby 1662 reusing matrix information to achieve increased efficiency. 

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Frees all GPU-side data for the matrix: for a non-factored matrix the mult
   structures (and their transpose counterparts), for a factored matrix the four
   triangular-factor structures, plus the CUSPARSE handle in both cases.
   Finishes by delegating to MatDestroy_SeqAIJ for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              /* CSR storage: the three thrust arrays, then the container */
              CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
              if (mat->values) delete (THRUSTARRAY*)mat->values;
              if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
              if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
              delete (CsrMatrix*)matstruct->mat;
            }
          }
          /* alpha/beta are device scalars allocated with cudaMalloc */
          if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
          if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        /* same teardown for the lazily built transpose, if it exists */
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
              if (matT->values) delete (THRUSTARRAY*)matT->values;
              if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
              if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
          if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
          if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* nothing was ever copied to the GPU; only the handle needs destroying */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
          if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
          if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
          if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactor->csrMat;
        }
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
          if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
          if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
          if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactor->csrMat;
        }
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
          if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
          if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
          if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactorT->csrMat;
        }
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
          if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
          if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
          if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactorT->csrMat;
        }
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if
(cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor for MATSEQAIJCUSPARSE: builds the host SeqAIJ matrix, then
   attaches either a GPU mult structure (non-factored) or the triangular-factor
   containers (factored) in A->spptr, and installs the CUSPARSE operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    /* NOTE(review): stream (and handle=0 above) are assigned twice; redundant but harmless */
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1895 1896 Level: beginner 1897 1898 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1899 M*/ 1900