1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include <petscconf.h> 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include <../src/vec/vec/impls/dvecimpl.h> 10 #include <petsc-private/vecimpl.h> 11 #undef VecType 12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatCUSPARSESetStream" 36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 37 { 38 cusparseStatus_t stat; 39 Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 40 41 PetscFunctionBegin; 42 cusparsestruct->stream = stream; 43 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNCT__ 48 #define __FUNCT__ "MatCUSPARSESetHandle" 49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 50 { 51 cusparseStatus_t stat; 52 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 53 54 PetscFunctionBegin; 55 if (cusparsestruct->handle) 56 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 57 cusparsestruct->handle = handle; 58 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 59 PetscFunctionReturn(0); 60 } 61 62 #undef __FUNCT__ 63 #define __FUNCT__ "MatCUSPARSEClearHandle" 64 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 65 { 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 PetscFunctionBegin; 68 if (cusparsestruct->handle) 69 cusparsestruct->handle = 0; 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 76 { 77 PetscFunctionBegin; 78 *type = MATSOLVERCUSPARSE; 79 PetscFunctionReturn(0); 80 } 81 82 /*MC 83 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 84 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 86 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 87 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 88 algorithms are not recommended. This class does NOT support direct solver operations. 
89 90 Level: beginner 91 92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 93 M*/ 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 #undef __FUNCT__ 123 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 124 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 125 { 126 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 127 128 PetscFunctionBegin; 129 #if CUDA_VERSION>=4020 130 switch (op) { 131 case MAT_CUSPARSE_MULT: 132 
cusparsestruct->format = format; 133 break; 134 case MAT_CUSPARSE_ALL: 135 cusparsestruct->format = format; 136 break; 137 default: 138 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 139 } 140 #else 141 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 142 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 143 #endif 144 PetscFunctionReturn(0); 145 } 146 147 /*@ 148 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 149 operation. Only the MatMult operation can use different GPU storage formats 150 for MPIAIJCUSPARSE matrices. 151 Not Collective 152 153 Input Parameters: 154 + A - Matrix of type SEQAIJCUSPARSE 155 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 156 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 157 158 Output Parameter: 159 160 Level: intermediate 161 162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 163 @*/ 164 #undef __FUNCT__ 165 #define __FUNCT__ "MatCUSPARSESetFormat" 166 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 167 { 168 PetscErrorCode ierr; 169 170 PetscFunctionBegin; 171 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 172 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 178 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 MatCUSPARSEStorageFormat format; 182 PetscBool flg; 183 184 PetscFunctionBegin; 185 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 186 ierr = PetscObjectOptionsBegin((PetscObject)A); 187 if (A->factortype==MAT_FACTOR_NONE) { 188 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 189 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190 if (flg) { 191 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 192 } 193 } 194 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 195 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 196 if (flg) { 197 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 198 } 199 ierr = PetscOptionsEnd();CHKERRQ(ierr); 200 PetscFunctionReturn(0); 201 202 } 203 204 #undef __FUNCT__ 205 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 206 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat 
A,IS isrow,IS iscol,const MatFactorInfo *info) 207 { 208 PetscErrorCode ierr; 209 210 PetscFunctionBegin; 211 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 212 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 213 PetscFunctionReturn(0); 214 } 215 216 #undef __FUNCT__ 217 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 218 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 219 { 220 PetscErrorCode ierr; 221 222 PetscFunctionBegin; 223 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 224 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 230 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 231 { 232 PetscErrorCode ierr; 233 234 PetscFunctionBegin; 235 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 236 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 237 PetscFunctionReturn(0); 238 } 239 240 #undef __FUNCT__ 241 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 242 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 243 { 244 PetscErrorCode ierr; 245 246 PetscFunctionBegin; 247 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 248 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 254 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 255 { 256 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 257 PetscInt n = A->rmap->n; 258 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 259 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 260 cusparseStatus_t stat; 261 const PetscInt *ai = a->i,*aj = a->j,*vi; 262 const MatScalar *aa = a->a,*v; 263 PetscInt *AiLo, *AjLo; 264 PetscScalar *AALo; 265 PetscInt i,nz, nzLower, offset, rowOffset; 266 PetscErrorCode ierr; 267 268 PetscFunctionBegin; 269 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 270 try { 271 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 272 nzLower=n+ai[n]-ai[1]; 273 274 /* Allocate Space for the lower triangular matrix */ 275 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 276 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 277 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 278 279 /* Fill the lower triangular matrix */ 280 AiLo[0] = (PetscInt) 0; 281 AiLo[n] = nzLower; 282 AjLo[0] = (PetscInt) 0; 283 AALo[0] = (MatScalar) 1.0; 284 v = aa; 285 vi = aj; 286 offset = 1; 287 rowOffset= 1; 288 for (i=1; i<n; i++) { 289 nz = ai[i+1] - ai[i]; 290 /* additional 1 for the term on the diagonal */ 291 AiLo[i] = rowOffset; 292 rowOffset += nz+1; 293 294 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 295 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 296 297 offset += nz; 298 AjLo[offset] = (PetscInt) i; 299 AALo[offset] = (MatScalar) 1.0; 300 offset += 1; 301 302 v += nz; 303 vi += nz; 304 } 305 306 /* allocate space for the triangular factor information */ 307 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 308 309 /* Create the matrix description */ 310 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 311 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 312 stat = cusparseSetMatType(loTriFactor->descr, 
CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 313 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 314 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 315 316 /* Create the solve analysis information */ 317 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 318 319 /* set the operation */ 320 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 321 322 /* set the matrix */ 323 loTriFactor->csrMat = new CsrMatrix; 324 loTriFactor->csrMat->num_rows = n; 325 loTriFactor->csrMat->num_cols = n; 326 loTriFactor->csrMat->num_entries = nzLower; 327 328 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 329 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 330 331 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 332 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 333 334 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 335 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 336 337 /* perform the solve analysis */ 338 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 339 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 340 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 341 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 342 343 /* assign the pointer. Is this really necessary? 
*/ 344 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 345 346 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 347 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 348 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 349 } catch(char *ex) { 350 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 351 } 352 } 353 PetscFunctionReturn(0); 354 } 355 356 #undef __FUNCT__ 357 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 358 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 359 { 360 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 361 PetscInt n = A->rmap->n; 362 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 363 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 364 cusparseStatus_t stat; 365 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 366 const MatScalar *aa = a->a,*v; 367 PetscInt *AiUp, *AjUp; 368 PetscScalar *AAUp; 369 PetscInt i,nz, nzUpper, offset; 370 PetscErrorCode ierr; 371 372 PetscFunctionBegin; 373 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 374 try { 375 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 376 nzUpper = adiag[0]-adiag[n]; 377 378 /* Allocate Space for the upper triangular matrix */ 379 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 380 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 381 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 382 383 /* Fill the upper triangular matrix */ 384 AiUp[0]=(PetscInt) 0; 385 AiUp[n]=nzUpper; 386 offset = nzUpper; 387 for (i=n-1; i>=0; i--) { 388 v = aa + adiag[i+1] + 1; 389 vi = aj + adiag[i+1] + 1; 390 391 /* number of elements NOT on the diagonal */ 392 nz = adiag[i] - adiag[i+1]-1; 393 394 /* decrement the offset */ 395 offset -= (nz+1); 396 397 /* first, set the diagonal elements */ 398 AjUp[offset] = (PetscInt) i; 399 AAUp[offset] = 1./v[nz]; 400 AiUp[i] = AiUp[i+1] - (nz+1); 401 402 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 403 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 404 } 405 406 /* allocate space for the triangular factor information */ 407 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 408 409 /* Create the matrix description */ 410 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 411 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 412 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 413 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 414 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 415 416 /* Create the solve analysis information */ 417 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 418 419 /* set the operation */ 420 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 421 422 /* set the matrix */ 423 upTriFactor->csrMat = new CsrMatrix; 424 upTriFactor->csrMat->num_rows = 
n; 425 upTriFactor->csrMat->num_cols = n; 426 upTriFactor->csrMat->num_entries = nzUpper; 427 428 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 429 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 430 431 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 432 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 433 434 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 435 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 436 437 /* perform the solve analysis */ 438 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 439 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 440 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 441 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 442 443 /* assign the pointer. Is this really necessary? */ 444 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 445 446 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 447 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 448 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 449 } catch(char *ex) { 450 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 451 } 452 } 453 PetscFunctionReturn(0); 454 } 455 456 #undef __FUNCT__ 457 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 458 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 459 { 460 PetscErrorCode ierr; 461 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 462 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 463 IS isrow = a->row,iscol = a->icol; 464 PetscBool row_identity,col_identity; 465 const PetscInt *r,*c; 466 PetscInt n = A->rmap->n; 467 468 PetscFunctionBegin; 469 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 470 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 471 472 cusparseTriFactors->workVector = new 
THRUSTARRAY; 473 cusparseTriFactors->workVector->resize(n); 474 cusparseTriFactors->nnz=a->nz; 475 476 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 477 /*lower triangular indices */ 478 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 479 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 480 if (!row_identity) { 481 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 482 cusparseTriFactors->rpermIndices->assign(r, r+n); 483 } 484 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 485 486 /*upper triangular indices */ 487 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 488 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 489 if (!col_identity) { 490 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 491 cusparseTriFactors->cpermIndices->assign(c, c+n); 492 } 493 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 494 PetscFunctionReturn(0); 495 } 496 497 #undef __FUNCT__ 498 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 499 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 500 { 501 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 502 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 503 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 504 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 505 cusparseStatus_t stat; 506 PetscErrorCode ierr; 507 PetscInt *AiUp, *AjUp; 508 PetscScalar *AAUp; 509 PetscScalar *AALo; 510 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 511 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 512 const PetscInt *ai = b->i,*aj = b->j,*vj; 513 const MatScalar *aa = b->a,*v; 514 515 PetscFunctionBegin; 516 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 517 try { 518 /* Allocate Space for the upper triangular matrix */ 519 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 
520 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 521 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 522 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 523 524 /* Fill the upper triangular matrix */ 525 AiUp[0]=(PetscInt) 0; 526 AiUp[n]=nzUpper; 527 offset = 0; 528 for (i=0; i<n; i++) { 529 /* set the pointers */ 530 v = aa + ai[i]; 531 vj = aj + ai[i]; 532 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 533 534 /* first, set the diagonal elements */ 535 AjUp[offset] = (PetscInt) i; 536 AAUp[offset] = 1.0/v[nz]; 537 AiUp[i] = offset; 538 AALo[offset] = 1.0/v[nz]; 539 540 offset+=1; 541 if (nz>0) { 542 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 543 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 544 for (j=offset; j<offset+nz; j++) { 545 AAUp[j] = -AAUp[j]; 546 AALo[j] = AAUp[j]/v[nz]; 547 } 548 offset+=nz; 549 } 550 } 551 552 /* allocate space for the triangular factor information */ 553 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 554 555 /* Create the matrix description */ 556 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 557 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 558 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 559 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 560 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 561 562 /* Create the solve analysis information */ 563 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 564 565 /* set the operation */ 566 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 567 568 /* set the matrix */ 569 upTriFactor->csrMat = new CsrMatrix; 570 upTriFactor->csrMat->num_rows = A->rmap->n; 571 
upTriFactor->csrMat->num_cols = A->cmap->n; 572 upTriFactor->csrMat->num_entries = a->nz; 573 574 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 575 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 576 577 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 578 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 579 580 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 581 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 582 583 /* perform the solve analysis */ 584 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 585 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 586 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 587 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 588 589 /* assign the pointer. Is this really necessary? */ 590 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 591 592 /* allocate space for the triangular factor information */ 593 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 594 595 /* Create the matrix description */ 596 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 597 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 598 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 599 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 600 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 601 602 /* Create the solve analysis information */ 603 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 604 605 /* set the operation */ 606 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 607 608 /* set the matrix */ 609 loTriFactor->csrMat = new CsrMatrix; 610 
loTriFactor->csrMat->num_rows = A->rmap->n; 611 loTriFactor->csrMat->num_cols = A->cmap->n; 612 loTriFactor->csrMat->num_entries = a->nz; 613 614 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 615 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 616 617 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 618 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 619 620 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 621 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 622 623 /* perform the solve analysis */ 624 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 625 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 626 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 627 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 628 629 /* assign the pointer. Is this really necessary? 
*/ 630 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 631 632 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 633 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 634 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 635 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 636 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 637 } catch(char *ex) { 638 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 639 } 640 } 641 PetscFunctionReturn(0); 642 } 643 644 #undef __FUNCT__ 645 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 646 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 647 { 648 PetscErrorCode ierr; 649 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 650 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 651 IS ip = a->row; 652 const PetscInt *rip; 653 PetscBool perm_identity; 654 PetscInt n = A->rmap->n; 655 656 PetscFunctionBegin; 657 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 658 cusparseTriFactors->workVector = new THRUSTARRAY; 659 cusparseTriFactors->workVector->resize(n); 660 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 661 662 /*lower triangular indices */ 663 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 664 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 665 if (!perm_identity) { 666 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 667 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 668 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 669 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 670 } 671 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 672 PetscFunctionReturn(0); 673 } 674 675 #undef __FUNCT__ 676 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 677 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 678 { 679 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 680 IS isrow = b->row,iscol = b->col; 681 PetscBool row_identity,col_identity; 682 PetscErrorCode ierr; 683 684 PetscFunctionBegin; 
685 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 686 /* determine which version of MatSolve needs to be used. */ 687 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 688 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 689 if (row_identity && col_identity) { 690 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 691 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 692 } else { 693 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 694 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 695 } 696 697 /* get the triangular factors */ 698 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 704 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 705 { 706 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 707 IS ip = b->row; 708 PetscBool perm_identity; 709 PetscErrorCode ierr; 710 711 PetscFunctionBegin; 712 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 713 714 /* determine which version of MatSolve needs to be used. 
*/ 715 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 716 if (perm_identity) { 717 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 718 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 719 } else { 720 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 721 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 722 } 723 724 /* get the triangular factors */ 725 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 726 PetscFunctionReturn(0); 727 } 728 729 #undef __FUNCT__ 730 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 731 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 732 { 733 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 734 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 735 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 736 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 737 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 738 cusparseStatus_t stat; 739 cusparseIndexBase_t indexBase; 740 cusparseMatrixType_t matrixType; 741 cusparseFillMode_t fillMode; 742 cusparseDiagType_t diagType; 743 744 PetscFunctionBegin; 745 746 /*********************************************/ 747 /* Now the Transpose of the Lower Tri Factor */ 748 /*********************************************/ 749 750 /* allocate space for the transpose of the lower triangular factor */ 751 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 752 753 /* set the matrix descriptors of the lower triangular factor */ 754 matrixType = cusparseGetMatType(loTriFactor->descr); 755 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 756 fillMode = 
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 757 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 758 diagType = cusparseGetMatDiagType(loTriFactor->descr); 759 760 /* Create the matrix description */ 761 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 762 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 763 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 764 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 765 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 766 767 /* Create the solve analysis information */ 768 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 769 770 /* set the operation */ 771 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 772 773 /* allocate GPU space for the CSC of the lower triangular factor*/ 774 loTriFactorT->csrMat = new CsrMatrix; 775 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 776 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 777 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 778 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 779 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 780 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 781 782 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 783 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 784 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 785 loTriFactor->csrMat->values->data().get(), 786 loTriFactor->csrMat->row_offsets->data().get(), 787 loTriFactor->csrMat->column_indices->data().get(), 788 loTriFactorT->csrMat->values->data().get(), 789 loTriFactorT->csrMat->column_indices->data().get(), 790 loTriFactorT->csrMat->row_offsets->data().get(), 791 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 792 793 /* perform the solve analysis on the transposed matrix */ 794 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 795 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 796 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 797 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 798 loTriFactorT->solveInfo);CHKERRCUSP(stat); 799 800 /* assign the pointer. Is this really necessary? */ 801 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 802 803 /*********************************************/ 804 /* Now the Transpose of the Upper Tri Factor */ 805 /*********************************************/ 806 807 /* allocate space for the transpose of the upper triangular factor */ 808 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 809 810 /* set the matrix descriptors of the upper triangular factor */ 811 matrixType = cusparseGetMatType(upTriFactor->descr); 812 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 813 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
814 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 815 diagType = cusparseGetMatDiagType(upTriFactor->descr); 816 817 /* Create the matrix description */ 818 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 819 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 820 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 821 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 822 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 823 824 /* Create the solve analysis information */ 825 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 826 827 /* set the operation */ 828 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 829 830 /* allocate GPU space for the CSC of the upper triangular factor*/ 831 upTriFactorT->csrMat = new CsrMatrix; 832 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 833 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 834 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 835 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 836 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 837 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 838 839 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 840 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 841 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 842 upTriFactor->csrMat->values->data().get(), 843 upTriFactor->csrMat->row_offsets->data().get(), 844 upTriFactor->csrMat->column_indices->data().get(), 845 upTriFactorT->csrMat->values->data().get(), 846 upTriFactorT->csrMat->column_indices->data().get(), 847 upTriFactorT->csrMat->row_offsets->data().get(), 848 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 849 850 /* perform the solve analysis on the transposed matrix */ 851 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 852 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 853 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 854 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 855 upTriFactorT->solveInfo);CHKERRCUSP(stat); 856 857 /* assign the pointer. Is this really necessary? 
*/ 858 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 859 PetscFunctionReturn(0); 860 } 861 862 #undef __FUNCT__ 863 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 864 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 865 { 866 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 867 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 868 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 869 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 870 cusparseStatus_t stat; 871 cusparseIndexBase_t indexBase; 872 cudaError_t err; 873 874 PetscFunctionBegin; 875 876 /* allocate space for the triangular factor information */ 877 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 878 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 879 indexBase = cusparseGetMatIndexBase(matstruct->descr); 880 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 881 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 882 883 /* set alpha and beta */ 884 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 885 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 886 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 887 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 888 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 889 890 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 891 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 892 CsrMatrix *matrixT= new CsrMatrix; 893 matrixT->num_rows = A->rmap->n; 894 matrixT->num_cols = A->cmap->n; 895 matrixT->num_entries = a->nz; 896 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
897 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 898 matrixT->values = new THRUSTARRAY(a->nz); 899 900 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 901 indexBase = cusparseGetMatIndexBase(matstruct->descr); 902 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 903 matrix->num_cols, matrix->num_entries, 904 matrix->values->data().get(), 905 matrix->row_offsets->data().get(), 906 matrix->column_indices->data().get(), 907 matrixT->values->data().get(), 908 matrixT->column_indices->data().get(), 909 matrixT->row_offsets->data().get(), 910 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 911 912 /* assign the pointer */ 913 matstructT->mat = matrixT; 914 915 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 916 #if CUDA_VERSION>=5000 917 /* First convert HYB to CSR */ 918 CsrMatrix *temp= new CsrMatrix; 919 temp->num_rows = A->rmap->n; 920 temp->num_cols = A->cmap->n; 921 temp->num_entries = a->nz; 922 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 923 temp->column_indices = new THRUSTINTARRAY32(a->nz); 924 temp->values = new THRUSTARRAY(a->nz); 925 926 927 stat = cusparse_hyb2csr(cusparsestruct->handle, 928 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 929 temp->values->data().get(), 930 temp->row_offsets->data().get(), 931 temp->column_indices->data().get());CHKERRCUSP(stat); 932 933 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 934 CsrMatrix *tempT= new CsrMatrix; 935 tempT->num_rows = A->rmap->n; 936 tempT->num_cols = A->cmap->n; 937 tempT->num_entries = a->nz; 938 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 939 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 940 tempT->values = new THRUSTARRAY(a->nz); 941 942 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 943 temp->num_cols, temp->num_entries, 944 temp->values->data().get(), 945 temp->row_offsets->data().get(), 946 temp->column_indices->data().get(), 947 tempT->values->data().get(), 948 tempT->column_indices->data().get(), 949 tempT->row_offsets->data().get(), 950 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 951 952 /* Last, convert CSC to HYB */ 953 cusparseHybMat_t hybMat; 954 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 955 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 956 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 957 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 958 matstructT->descr, tempT->values->data().get(), 959 tempT->row_offsets->data().get(), 960 tempT->column_indices->data().get(), 961 hybMat, 0, partition);CHKERRCUSP(stat); 962 963 /* assign the pointer */ 964 matstructT->mat = hybMat; 965 966 /* delete temporaries */ 967 if (tempT) { 968 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 969 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 970 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 971 delete (CsrMatrix*) tempT; 972 } 973 if (temp) { 974 if (temp->values) delete (THRUSTARRAY*) temp->values; 975 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 976 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 977 delete (CsrMatrix*) temp; 978 } 979 #else 980 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in 
MatMultTranspose) require CUDA 5.0 or later."); 981 #endif 982 } 983 /* assign the compressed row indices */ 984 matstructT->cprowIndices = new THRUSTINTARRAY; 985 986 /* assign the pointer */ 987 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 988 PetscFunctionReturn(0); 989 } 990 991 #undef __FUNCT__ 992 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 993 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 994 { 995 CUSPARRAY *xGPU, *bGPU; 996 cusparseStatus_t stat; 997 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 998 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 999 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1000 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1001 PetscErrorCode ierr; 1002 1003 PetscFunctionBegin; 1004 /* Analyze the matrix and create the transpose ... 
on the fly */ 1005 if (!loTriFactorT && !upTriFactorT) { 1006 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1007 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1008 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1009 } 1010 1011 /* Get the GPU pointers */ 1012 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1013 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1014 1015 /* First, reorder with the row permutation */ 1016 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1017 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1018 xGPU->begin()); 1019 1020 /* First, solve U */ 1021 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1022 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1023 upTriFactorT->csrMat->values->data().get(), 1024 upTriFactorT->csrMat->row_offsets->data().get(), 1025 upTriFactorT->csrMat->column_indices->data().get(), 1026 upTriFactorT->solveInfo, 1027 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1028 1029 /* Then, solve L */ 1030 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1031 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1032 loTriFactorT->csrMat->values->data().get(), 1033 loTriFactorT->csrMat->row_offsets->data().get(), 1034 loTriFactorT->csrMat->column_indices->data().get(), 1035 loTriFactorT->solveInfo, 1036 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1037 1038 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1039 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1040 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1041 tempGPU->begin()); 1042 1043 /* Copy the temporary to the full solution. */ 1044 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1045 1046 /* restore */ 1047 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1048 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1049 ierr = WaitForGPU();CHKERRCUSP(ierr); 1050 1051 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1052 PetscFunctionReturn(0); 1053 } 1054 1055 #undef __FUNCT__ 1056 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 1057 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1058 { 1059 CUSPARRAY *xGPU,*bGPU; 1060 cusparseStatus_t stat; 1061 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1062 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1063 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1064 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1065 PetscErrorCode ierr; 1066 1067 PetscFunctionBegin; 1068 /* Analyze the matrix and create the transpose ... 
on the fly */ 1069 if (!loTriFactorT && !upTriFactorT) { 1070 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1071 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1072 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1073 } 1074 1075 /* Get the GPU pointers */ 1076 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1077 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1078 1079 /* First, solve U */ 1080 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1081 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1082 upTriFactorT->csrMat->values->data().get(), 1083 upTriFactorT->csrMat->row_offsets->data().get(), 1084 upTriFactorT->csrMat->column_indices->data().get(), 1085 upTriFactorT->solveInfo, 1086 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1087 1088 /* Then, solve L */ 1089 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1090 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1091 loTriFactorT->csrMat->values->data().get(), 1092 loTriFactorT->csrMat->row_offsets->data().get(), 1093 loTriFactorT->csrMat->column_indices->data().get(), 1094 loTriFactorT->solveInfo, 1095 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1096 1097 /* restore */ 1098 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1099 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1100 ierr = WaitForGPU();CHKERRCUSP(ierr); 1101 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1102 PetscFunctionReturn(0); 1103 } 1104 1105 #undef __FUNCT__ 1106 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 1107 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1108 { 1109 CUSPARRAY *xGPU,*bGPU; 1110 cusparseStatus_t stat; 1111 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1112 
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1113 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1114 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1115 PetscErrorCode ierr; 1116 VecType t; 1117 PetscBool flg; 1118 1119 PetscFunctionBegin; 1120 ierr = VecGetType(bb,&t);CHKERRQ(ierr); 1121 ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr); 1122 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUSP); 1123 ierr = VecGetType(xx,&t);CHKERRQ(ierr); 1124 ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr); 1125 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUSP); 1126 1127 /* Get the GPU pointers */ 1128 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1129 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1130 1131 /* First, reorder with the row permutation */ 1132 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1133 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1134 xGPU->begin()); 1135 1136 /* Next, solve L */ 1137 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1138 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1139 loTriFactor->csrMat->values->data().get(), 1140 loTriFactor->csrMat->row_offsets->data().get(), 1141 loTriFactor->csrMat->column_indices->data().get(), 1142 loTriFactor->solveInfo, 1143 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1144 1145 /* Then, solve U */ 1146 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1147 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1148 
upTriFactor->csrMat->values->data().get(), 1149 upTriFactor->csrMat->row_offsets->data().get(), 1150 upTriFactor->csrMat->column_indices->data().get(), 1151 upTriFactor->solveInfo, 1152 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1153 1154 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1155 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1156 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1157 tempGPU->begin()); 1158 1159 /* Copy the temporary to the full solution. */ 1160 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1161 1162 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1163 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1164 ierr = WaitForGPU();CHKERRCUSP(ierr); 1165 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1166 PetscFunctionReturn(0); 1167 } 1168 1169 #undef __FUNCT__ 1170 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 1171 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1172 { 1173 CUSPARRAY *xGPU,*bGPU; 1174 cusparseStatus_t stat; 1175 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1176 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1177 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1178 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1179 PetscErrorCode ierr; 1180 1181 PetscFunctionBegin; 1182 /* Get the GPU pointers */ 1183 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1184 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1185 1186 /* First, solve L */ 1187 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1188 
loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1189 loTriFactor->csrMat->values->data().get(), 1190 loTriFactor->csrMat->row_offsets->data().get(), 1191 loTriFactor->csrMat->column_indices->data().get(), 1192 loTriFactor->solveInfo, 1193 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1194 1195 /* Next, solve U */ 1196 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1197 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1198 upTriFactor->csrMat->values->data().get(), 1199 upTriFactor->csrMat->row_offsets->data().get(), 1200 upTriFactor->csrMat->column_indices->data().get(), 1201 upTriFactor->solveInfo, 1202 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1203 1204 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1205 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1206 ierr = WaitForGPU();CHKERRCUSP(ierr); 1207 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1208 PetscFunctionReturn(0); 1209 } 1210 1211 #undef __FUNCT__ 1212 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 1213 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1214 { 1215 1216 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1217 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1218 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1219 PetscInt m = A->rmap->n,*ii,*ridx; 1220 PetscErrorCode ierr; 1221 cusparseStatus_t stat; 1222 cudaError_t err; 1223 1224 PetscFunctionBegin; 1225 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 1226 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1227 if (matstruct) { 1228 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized."); 1229 } 1230 try { 1231 cusparsestruct->nonzerorow=0; 1232 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1233 1234 if (a->compressedrow.use) 
{ 1235 m = a->compressedrow.nrows; 1236 ii = a->compressedrow.i; 1237 ridx = a->compressedrow.rindex; 1238 } else { 1239 /* Forcing compressed row on the GPU */ 1240 int k=0; 1241 ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr); 1242 ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr); 1243 ii[0]=0; 1244 for (int j = 0; j<m; j++) { 1245 if ((a->i[j+1]-a->i[j])>0) { 1246 ii[k] = a->i[j]; 1247 ridx[k]= j; 1248 k++; 1249 } 1250 } 1251 ii[cusparsestruct->nonzerorow] = a->nz; 1252 m = cusparsestruct->nonzerorow; 1253 } 1254 1255 /* allocate space for the triangular factor information */ 1256 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1257 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat); 1258 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 1259 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 1260 1261 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 1262 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1263 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err); 1264 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1265 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 1266 1267 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1268 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1269 /* set the matrix */ 1270 CsrMatrix *matrix= new CsrMatrix; 1271 matrix->num_rows = m; 1272 matrix->num_cols = A->cmap->n; 1273 matrix->num_entries = a->nz; 1274 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1275 matrix->row_offsets->assign(ii, ii + m+1); 1276 1277 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1278 matrix->column_indices->assign(a->j, a->j+a->nz); 1279 1280 matrix->values = 
new THRUSTARRAY(a->nz); 1281 matrix->values->assign(a->a, a->a+a->nz); 1282 1283 /* assign the pointer */ 1284 matstruct->mat = matrix; 1285 1286 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1287 #if CUDA_VERSION>=4020 1288 CsrMatrix *matrix= new CsrMatrix; 1289 matrix->num_rows = m; 1290 matrix->num_cols = A->cmap->n; 1291 matrix->num_entries = a->nz; 1292 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1293 matrix->row_offsets->assign(ii, ii + m+1); 1294 1295 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1296 matrix->column_indices->assign(a->j, a->j+a->nz); 1297 1298 matrix->values = new THRUSTARRAY(a->nz); 1299 matrix->values->assign(a->a, a->a+a->nz); 1300 1301 cusparseHybMat_t hybMat; 1302 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 1303 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1304 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1305 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1306 matstruct->descr, matrix->values->data().get(), 1307 matrix->row_offsets->data().get(), 1308 matrix->column_indices->data().get(), 1309 hybMat, 0, partition);CHKERRCUSP(stat); 1310 /* assign the pointer */ 1311 matstruct->mat = hybMat; 1312 1313 if (matrix) { 1314 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1315 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1316 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1317 delete (CsrMatrix*)matrix; 1318 } 1319 #endif 1320 } 1321 1322 /* assign the compressed row indices */ 1323 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1324 matstruct->cprowIndices->assign(ridx,ridx+m); 1325 1326 /* assign the pointer */ 1327 cusparsestruct->mat = matstruct; 1328 1329 if (!a->compressedrow.use) { 1330 ierr = PetscFree(ii);CHKERRQ(ierr); 1331 ierr = PetscFree(ridx);CHKERRQ(ierr); 1332 } 1333 cusparsestruct->workVector = new 
THRUSTARRAY; 1334 cusparsestruct->workVector->resize(m); 1335 } catch(char *ex) { 1336 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1337 } 1338 ierr = WaitForGPU();CHKERRCUSP(ierr); 1339 1340 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 1341 1342 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1343 } 1344 PetscFunctionReturn(0); 1345 } 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE" 1349 static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 1350 { 1351 PetscErrorCode ierr; 1352 PetscInt rbs,cbs; 1353 1354 PetscFunctionBegin; 1355 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 1356 if (right) { 1357 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1358 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1359 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 1360 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 1361 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1362 } 1363 if (left) { 1364 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1365 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1366 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 1367 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 1368 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1369 } 1370 PetscFunctionReturn(0); 1371 } 1372 1373 struct VecCUSPPlusEquals 1374 { 1375 template <typename Tuple> 1376 __host__ __device__ 1377 void operator()(Tuple t) 1378 { 1379 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1380 } 1381 }; 1382 1383 #undef __FUNCT__ 1384 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 1385 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1386 { 1387 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1388 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1389 Mat_SeqAIJCUSPARSEMultStruct *matstruct = 
(Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY *xarray,*yarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR storage: a single cusparse SpMV computes y = alpha*A*x + beta*y */
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB storage are both carried in a cusparseHybMat_t container (CUDA >= 4.2 only) */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    /* synchronize only when the user has not supplied a stream of their own */
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* one multiply per nonzero plus one add per nonzero beyond the first in each nonempty row */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T x.  Implemented as a NON_TRANSPOSE SpMV against an explicitly stored
   transpose (cusparsestruct->matTranspose), which is generated lazily below on
   the first transpose product. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY *xarray,*yarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    /* build the device-side transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x.  The SpMV result for the compressed (nonempty) rows lands in
   cusparsestruct->workVector and is then scattered into z (already a copy of y)
   with += through the compressed-row index map matstruct->cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY *xarray,*yarray,*zarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is never referenced below (zz already holds a copy of yy);
       presumably the Get/Restore pair is only GPU-state bookkeeping -- confirm */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T x, using the lazily generated explicit transpose; otherwise the
   same compressed-row work-vector + scatter scheme as MatMultAdd above. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY *xarray,*yarray,*zarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray->data().get(), matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish host-side assembly, mirror the (non-factored) matrix to the GPU, and
   install the GPU mat-vec kernels once assembly is final (not on a flush). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Destructor.  Non-factored matrices carry a Mat_SeqAIJCUSPARSE in A->spptr
   (with an optional explicit transpose); factored matrices carry the four
   triangular-factor structures instead.  Both paths free all GPU-side members
   before deferring to MatDestroy_SeqAIJ for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
              if (mat->values) delete (THRUSTARRAY*)mat->values;
              if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
              if
(mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
              delete (CsrMatrix*)matstruct->mat;
            }
          }
          if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
          if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        /* the explicitly stored transpose (if it was ever generated) is torn down the same way */
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
              if (matT->values) delete (THRUSTARRAY*)matT->values;
              if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
              if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
          if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
          if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* nothing was ever copied to the GPU; only the library handle and the container exist */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
          if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
          if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
          if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactor->csrMat;
        }
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
          if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
          if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
          if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactor->csrMat;
        }
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      /* transpose factors exist only if a transpose solve was ever performed */
      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
          if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
          if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
          if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactorT->csrMat;
        }
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
          if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
          if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
          if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactorT->csrMat;
        }
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor: builds the host SeqAIJ matrix, then attaches the
   CUSPARSE-specific container in B->spptr -- one layout for regular matrices,
   another for triangular factors -- and installs the GPU method table. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/