/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/

#include "petscconf.h"
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
#undef VecType
#include "cusparsematimpl.h"

/* String table for the MatCUSPARSEStorageFormat enum.  The three trailing
   entries (enum type name, option prefix, null terminator) follow the PETSc
   PetscOptionsEnum() table convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, triangular-solve, and SpMV
   routines implemented later in this file. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the solver package name (MATSOLVERCUSPARSE) for a factor matrix
   produced by MatGetFactor_seqaij_cusparse(). */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Consult CUSPARSE documentation for more information about the matrix storage formats
  which correspond to the options database keys below.

  Options Database Keys:
. -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Factory routine: creates an empty square factor matrix B for an
   LU/ILU/ILUDT or Cholesky/ICC factorization of A and installs the
   CUSPARSE-aware symbolic factorization methods.  Storage allocation is
   deferred to the symbolic stage (MAT_SKIP_ALLOCATION below). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are propagated only for the LU family, not in
       the Cholesky/ICC branch below -- confirm this asymmetry is intentional. */
    ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the symbolic factorization allocates the factor's storage, so skip it here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for SpMV and/or triangular solves. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    /* NOTE(review): cusparseMatSolveStorageFormat is not a member of A's
       cusparsestruct, so it is presumably file/global state shared by all
       matrices -- verify that a per-matrix solve format is not expected. */
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod() is a no-op if A's type does not compose the method, so
     this is safe to call on non-CUSPARSE matrices */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_* options database keys.  For an unfactored
   matrix only the SpMV format option applies; for a factored matrix only the
   triangular-solve format option applies; -mat_cusparse_storage_format sets
   both at once. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead() is invoked before PetscObjectOptionsBegin();
     the usual PETSc pattern nests Head inside the Begin/End scope -- confirm
     this ordering is intended. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* combined option, honored for both factored and unfactored matrices */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU is done on the CPU by the SeqAIJ implementation; the GPU takes
   over at the numeric stage via the lufactornumeric hook installed here. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU on the CPU; numeric stage redirected to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC on the CPU; numeric stage redirected to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky on the CPU; numeric stage redirected to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Extracts the unit-diagonal lower triangular factor L from the host CSR data
   of the already-numerically-factored matrix A, uploads it to the GPU, and
   runs the CUSPARSE triangular-solve analysis on it.  Runs only when the GPU
   copy is unallocated or stale relative to the CPU copy. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1;
         assumes row 0 contributes no strictly-lower entries (ai[n]-ai[1]) --
         TODO confirm against the SeqAIJ factored-storage layout */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* cudaMallocHost gives pinned (page-locked) host staging buffers */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device-side arrays; assign() copies host staging buffers to the GPU */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Extracts the upper triangular factor U (non-unit diagonal) from the host
   CSR data of the factored matrix A, uploads it to the GPU, and runs the
   CUSPARSE triangular-solve analysis.  The rows are walked backwards using
   a->diag, which indexes the diagonal of each row in the factored storage. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases over the U portion of the factored storage, so the
         total upper count is adiag[0]-adiag[n] -- see MatILUFactorNumeric_SeqAIJ */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the factored storage keeps 1/d(i) at v[nz]; invert it back to d(i)?
           NOTE(review): AAUp stores 1./v[nz] -- presumably v[nz] holds the
           inverted pivot from the CPU factorization; confirm */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Drives the GPU setup for an ILU/LU factor: builds both triangular factors
   on the device, allocates the triangular-solve work vector, and caches the
   row/column permutations on the GPU (only when they are not the identity;
   the identity case uses the cheaper NaturalOrdering solve paths). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Builds both triangular factors for an ICC/Cholesky factorization from the
   host data of A in a single pass.  Only the upper-triangular pattern is
   stored on the GPU: the upper solve uses it directly, while the "lower"
   solve reuses the same pattern with rescaled values and
   CUSPARSE_OPERATION_TRANSPOSE (i.e. L is applied as a transposed upper
   solve).  Runs only when the GPU copy is unallocated or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both as Mat_SeqAIJ (above, for nz) and as
     Mat_SeqSBAIJ (here, for i/j/a) -- presumably the ICC factor is stored
     with an SBAIJ-compatible layout; confirm the two layouts share these
     leading fields. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AALo carries the rescaled values for the
         transposed ("lower") solve over the same sparsity pattern */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the row's diagonal entry (stored last in this layout --
           TODO confirm); both factors store its reciprocal on the diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U; the L copy is additionally scaled
             by the reciprocal diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* NOTE(review): DIAG_TYPE_UNIT here even though explicit diagonal values
         are stored -- CUSPARSE then ignores the stored diagonal during the
         solve; confirm this matches how the solve applies the 1/d scaling. */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor keeps FILL_MODE_UPPER because it is solved with
         CUSPARSE_OPERATION_TRANSPOSE below (L = upper pattern transposed) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same pattern as the upper factor, different (rescaled) values */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Drives the GPU setup for an ICC/Cholesky factor: uploads both triangular
   factors, allocates the solve work vector, and caches the (symmetric)
   permutation for both row and column sides when it is not the identity.
   nnz counts the off-diagonals of both factors plus one shared diagonal:
   (a->nz - n)*2 + n. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: the same permutation is used on both sides */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU/ILU: the factorization itself runs on the CPU (SeqAIJ code);
   afterwards the factors are analyzed and copied to the GPU, and the solve
   function pointers are chosen -- the cheaper NaturalOrdering variants when
   both the row and column permutations are the identity. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC: CPU factorization followed by GPU upload, mirroring
   MatLUFactorNumeric_SeqAIJCUSPARSE but with a single symmetric permutation. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Builds explicit transposes (CSC via csr2csc) of both triangular factors so
   that MatSolveTranspose can run NON_TRANSPOSE solves on them, and performs
   the CUSPARSE solve analysis on each transpose.  The fill mode is flipped
   relative to the source factor since transposing swaps upper and lower. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  /* flip upper <-> lower: the transpose of a lower factor is upper, and vice versa */
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 812 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 813 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 814 upTriFactor->csrMat->values->data().get(), 815 upTriFactor->csrMat->row_offsets->data().get(), 816 upTriFactor->csrMat->column_indices->data().get(), 817 upTriFactorT->csrMat->values->data().get(), 818 upTriFactorT->csrMat->column_indices->data().get(), 819 upTriFactorT->csrMat->row_offsets->data().get(), 820 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 821 822 /* perform the solve analysis on the transposed matrix */ 823 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 824 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 825 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 826 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 827 upTriFactorT->solveInfo);CHKERRCUSP(stat); 828 829 /* assign the pointer. Is this really necessary? 
*/ 830 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 831 PetscFunctionReturn(0); 832 } 833 834 #undef __FUNCT__ 835 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 836 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 837 { 838 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 839 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 840 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 841 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 842 cusparseStatus_t stat; 843 cusparseIndexBase_t indexBase; 844 845 PetscFunctionBegin; 846 847 /* allocate space for the triangular factor information */ 848 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 849 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 850 indexBase = cusparseGetMatIndexBase(matstruct->descr); 851 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 852 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 853 854 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 855 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 856 CsrMatrix *matrixT= new CsrMatrix; 857 matrixT->num_rows = A->rmap->n; 858 matrixT->num_cols = A->cmap->n; 859 matrixT->num_entries = a->nz; 860 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 861 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 862 matrixT->values = new THRUSTARRAY(a->nz); 863 864 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {

    /* First convert HYB to CSR (staged through temporary device CSR arrays,
       since CUSPARSE has no direct HYB transpose) */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    /* ELL is obtained from the HYB format by forcing the MAX partition */
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries: the CSR staging copies are no longer needed once
       the HYB matrix owns the data */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices.
     NOTE(review): this array is allocated EMPTY, unlike the forward-mult
     structure which fills cprowIndices with the nonzero-row indices.
     MatMultTransposeAdd_SeqAIJCUSPARSE iterates workVector->size() elements
     through a permutation over these indices -- verify the sizes agree. */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solve A^T x = b using the explicitly transposed triangular factors.
   Since A = LU implies A^T = U^T L^T, the upper factor transpose is solved
   first, then the lower.  The transposed factors are built lazily on the
   first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation: xGPU[i] = bGPU[rperm[i]].
     NOTE(review): the forward solve applies rperm first and cperm last; this
     transpose path uses the same ordering.  For (P_r A P_c)^T the roles of
     the permutations swap -- confirm this ordering is intended. */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U (xGPU in, tempGPU out) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L (tempGPU in, xGPU out) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for the identity permutation:
   the reordering copies are skipped entirely. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY *xGPU,*bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U (A^T = U^T L^T, so the upper transpose comes first);
     bGPU in, tempGPU out */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L (tempGPU in, xGPU out) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Forward/backward triangular solve of A x = b with permuted factors:
   apply the row permutation, solve L then U, then apply the column
   permutation.  tempGPU (the shared work vector) holds intermediates. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY *xGPU,*bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation: xGPU[i] = bGPU[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L (xGPU in, tempGPU out) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U (tempGPU in, xGPU out) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Same as MatSolve_SeqAIJCUSPARSE but for the identity permutation:
   no reordering copies are needed. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY *xGPU,*bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L (bGPU in, tempGPU out) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U (tempGPU in, xGPU out) */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copy the host AIJ matrix to the GPU in the configured storage format
   (CSR, ELL, or HYB), building a compressed-row index set of the nonzero
   rows on the way.  No-op when the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count the rows that actually contain nonzeros */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m = a->compressedrow.nrows;
        ii = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;

        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = A->rmap->n;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        /* NOTE(review): this reads A->rmap->n+1 entries from ii, but in the
           compressed-row branches above ii has only nonzerorow+1 (or
           compressedrow.nrows+1) entries.  When some rows are empty this
           over-reads the host array -- verify the intended semantics. */
        matrix->row_offsets->assign(ii, ii + A->rmap->n+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {

        /* stage the matrix in CSR on the device, then convert to HYB/ELL */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = A->rmap->n;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        matrix->row_offsets->assign(ii, ii + A->rmap->n+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        /* ELL is the HYB format with the MAX partition forced */
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the CSR staging copy is no longer needed */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated locally only when compressedrow was not
         already in use */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector shared by the mult-add paths; sized to the number of
         (compressed) rows */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create CUSP-typed work vectors compatible with this matrix (right: domain,
   left: range), sharing the matrix layouts. */
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
1296 1297 PetscFunctionBegin; 1298 if (right) { 1299 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1300 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1301 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 1302 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 1303 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1304 } 1305 if (left) { 1306 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1307 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1308 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 1309 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 1310 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1311 } 1312 PetscFunctionReturn(0); 1313 } 1314 1315 struct VecCUSPPlusEquals 1316 { 1317 template <typename Tuple> 1318 __host__ __device__ 1319 void operator()(Tuple t) 1320 { 1321 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1322 } 1323 }; 1324 1325 #undef __FUNCT__ 1326 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 1327 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1328 { 1329 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1330 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1331 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1332 CUSPARRAY *xarray,*yarray; 1333 PetscErrorCode ierr; 1334 cusparseStatus_t stat; 1335 1336 PetscFunctionBegin; 1337 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1338 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1339 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1340 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1341 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1342 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1343 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1344 mat->num_rows, 
mat->num_cols, mat->num_entries, 1345 &ALPHA, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1346 mat->column_indices->data().get(), xarray->data().get(), &BETA, 1347 yarray->data().get());CHKERRCUSP(stat); 1348 } else { 1349 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1350 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1351 &ALPHA, matstruct->descr, hybMat, 1352 xarray->data().get(), &BETA, 1353 yarray->data().get());CHKERRCUSP(stat); 1354 } 1355 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1356 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1357 if (!cusparsestruct->stream) { 1358 ierr = WaitForGPU();CHKERRCUSP(ierr); 1359 } 1360 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1361 PetscFunctionReturn(0); 1362 } 1363 1364 #undef __FUNCT__ 1365 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 1366 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1367 { 1368 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1369 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1370 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1371 CUSPARRAY *xarray,*yarray; 1372 PetscErrorCode ierr; 1373 cusparseStatus_t stat; 1374 1375 PetscFunctionBegin; 1376 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1377 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1378 if (!matstructT) { 1379 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1380 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1381 } 1382 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1383 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1384 1385 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1386 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1387 stat = cusparse_csr_spmv(cusparsestruct->handle, 
CUSPARSE_OPERATION_NON_TRANSPOSE, 1388 mat->num_rows, mat->num_cols, 1389 mat->num_entries, &ALPHA, matstructT->descr, 1390 mat->values->data().get(), mat->row_offsets->data().get(), 1391 mat->column_indices->data().get(), xarray->data().get(), &BETA, 1392 yarray->data().get());CHKERRCUSP(stat); 1393 } else { 1394 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1395 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1396 &ALPHA, matstructT->descr, hybMat, 1397 xarray->data().get(), &BETA, 1398 yarray->data().get());CHKERRCUSP(stat); 1399 } 1400 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1401 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1402 if (!cusparsestruct->stream) { 1403 ierr = WaitForGPU();CHKERRCUSP(ierr); 1404 } 1405 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1406 PetscFunctionReturn(0); 1407 } 1408 1409 1410 #undef __FUNCT__ 1411 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 1412 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1413 { 1414 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1415 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1416 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1417 CUSPARRAY *xarray,*yarray,*zarray; 1418 PetscErrorCode ierr; 1419 cusparseStatus_t stat; 1420 1421 PetscFunctionBegin; 1422 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1423 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1424 try { 1425 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1426 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1427 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1428 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1429 1430 /* multiply add */ 1431 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1432 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1433 stat = 
cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1434 mat->num_rows, mat->num_cols, 1435 mat->num_entries, &ALPHA, matstruct->descr, 1436 mat->values->data().get(), mat->row_offsets->data().get(), 1437 mat->column_indices->data().get(), xarray->data().get(), &BETA, 1438 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1439 } else { 1440 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1441 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1442 &ALPHA, matstruct->descr, hybMat, 1443 xarray->data().get(), &BETA, 1444 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1445 } 1446 1447 /* scatter the data from the temporary into the full vector with a += operation */ 1448 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))), 1449 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1450 VecCUSPPlusEquals()); 1451 1452 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1453 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1454 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1455 1456 } catch(char *ex) { 1457 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1458 } 1459 ierr = WaitForGPU();CHKERRCUSP(ierr); 1460 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1461 PetscFunctionReturn(0); 1462 } 1463 1464 #undef __FUNCT__ 1465 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1466 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1467 { 1468 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1469 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1470 Mat_SeqAIJCUSPARSEMultStruct *matstructT = 
                                               (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    /* the explicit transpose data structure is generated lazily, on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* zz starts as a copy of yy; the transpose product is accumulated into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose: the stored transpose is applied with
       a NON_TRANSPOSE cusparse operation */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols, mat->num_entries, &ALPHA, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), &BETA,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
      /* ELL/HYB storage */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               &ALPHA, matstructT->descr, hybMat,
                               xarray->data().get(), &BETA,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - completes assembly, copies the (unfactored)
   matrix to the GPU, and installs the CUSPARSE mult kernels.  On a flush
   assembly only the copy is done; the ops table is left untouched.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
1586 1587 Level: intermediate 1588 1589 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1590 @*/ 1591 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1592 { 1593 PetscErrorCode ierr; 1594 1595 PetscFunctionBegin; 1596 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1597 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1598 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1599 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1600 PetscFunctionReturn(0); 1601 } 1602 1603 #undef __FUNCT__ 1604 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1605 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1606 { 1607 PetscErrorCode ierr; 1608 cusparseStatus_t stat; 1609 1610 PetscFunctionBegin; 1611 if (A->factortype==MAT_FACTOR_NONE) { 1612 try { 1613 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1614 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1615 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1616 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1617 1618 /* delete all the members in each of the data structures */ 1619 if (matstruct) { 1620 if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); } 1621 if (matstruct->mat) { 1622 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1623 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat; 1624 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1625 } else { 1626 CsrMatrix* mat = (CsrMatrix*)matstruct->mat; 1627 if (mat->values) delete (THRUSTARRAY*)mat->values; 1628 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1629 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 
              delete (CsrMatrix*)matstruct->mat;
            }
          }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        /* the lazily-built transpose structure, if it was ever generated */
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
            } else {
              CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
              if (matT->values) delete (THRUSTARRAY*)matT->values;
              if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
              if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* never copied to the GPU: only the container and the handle exist */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
          if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
          if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
          if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactor->csrMat;
        }
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
          if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
          if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
          if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactor->csrMat;
        }
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      /* the transposed factors, if transpose solves were ever performed */
      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
          if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
          if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
          if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactorT->csrMat;
        }
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
          if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
          if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
          if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactorT->csrMat;
        }
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1729 } 1730 } 1731 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 1732 A->spptr = 0; 1733 1734 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1735 PetscFunctionReturn(0); 1736 } 1737 1738 #undef __FUNCT__ 1739 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1740 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1741 { 1742 PetscErrorCode ierr; 1743 cusparseStatus_t stat; 1744 cusparseHandle_t handle=0; 1745 1746 PetscFunctionBegin; 1747 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1748 if (B->factortype==MAT_FACTOR_NONE) { 1749 /* you cannot check the inode.use flag here since the matrix was just created. 1750 now build a GPU matrix data structure */ 1751 B->spptr = new Mat_SeqAIJCUSPARSE; 1752 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1753 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1754 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1755 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1756 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1757 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1758 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1759 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1760 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1761 } else { 1762 /* NEXT, set the pointers to the triangular factors */ 1763 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1764 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1765 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1766 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1767 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1768 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1769 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1770 
((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1771 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; 1772 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1773 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1774 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1775 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1776 } 1777 1778 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1779 default cusparse tri solve. Note the difference with the implementation in 1780 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1781 ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 1782 1783 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1784 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1785 B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 1786 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1787 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1788 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1789 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1790 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1791 1792 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1793 1794 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1795 1796 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1797 PetscFunctionReturn(0); 1798 } 1799 1800 /*M 1801 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1802 1803 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1804 CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 1805 the CUSPARSE library. 
1806 1807 Options Database Keys: 1808 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1809 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1810 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1811 - -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1812 1813 Level: beginner 1814 1815 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1816 M*/ 1817