1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include "petscconf.h" 7 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include "../src/vec/vec/impls/dvecimpl.h" 10 #include "petsc-private/vecimpl.h" 11 #undef VecType 12 #include "cusparsematimpl.h" 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatCUSPARSESetStream" 36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 37 { 38 cusparseStatus_t stat; 39 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 40 41 PetscFunctionBegin; 42 cusparsestruct->stream = stream; 43 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNCT__ 48 #define __FUNCT__ "MatCUSPARSESetHandle" 49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 50 { 51 cusparseStatus_t stat; 52 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 53 54 PetscFunctionBegin; 55 if (cusparsestruct->handle) 56 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 57 cusparsestruct->handle = handle; 58 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 59 PetscFunctionReturn(0); 60 } 61 62 #undef __FUNCT__ 63 #define __FUNCT__ "MatCUSPARSEClearHandle" 64 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 65 { 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 PetscFunctionBegin; 68 if (cusparsestruct->handle) 69 cusparsestruct->handle = 0; 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 76 { 77 PetscFunctionBegin; 78 *type = MATSOLVERCUSPARSE; 79 PetscFunctionReturn(0); 80 } 81 82 /*MC 83 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 84 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 86 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 87 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 88 algorithms are not recommended. This class does NOT support direct solver operations. 89 90 Consult CUSPARSE documentation for more information about the matrix storage formats 91 which correspond to the options database keys below. 92 93 Options Database Keys: 94 . -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 95 96 Level: beginner 97 98 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 99 M*/ 100 101 #undef __FUNCT__ 102 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 103 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 104 { 105 PetscErrorCode ierr; 106 PetscInt n = A->rmap->n; 107 108 PetscFunctionBegin; 109 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 110 (*B)->factortype = ftype; 111 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 112 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 113 114 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 115 ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr); 116 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 117 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 118 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 119 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 120 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 121 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 122 123 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 124 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 125 PetscFunctionReturn(0); 126 } 127 128 #undef __FUNCT__ 129 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 130 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131 { 132 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 133 134 PetscFunctionBegin; 135 switch (op) { 136 case MAT_CUSPARSE_MULT: 137 cusparsestruct->format = format; 138 break; 139 case MAT_CUSPARSE_SOLVE: 140 cusparseMatSolveStorageFormat = format; 141 break; 142 case MAT_CUSPARSE_ALL: 143 cusparsestruct->format = format; 144 cusparseMatSolveStorageFormat = format; 145 break; 146 default: 147 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 148 } 149 PetscFunctionReturn(0); 150 } 151 152 /*@ 153 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 154 operation. Only the MatMult operation can use different GPU storage formats 155 for MPIAIJCUSPARSE matrices. 156 Not Collective 157 158 Input Parameters: 159 + A - Matrix of type SEQAIJCUSPARSE 160 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 161 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB) 162 163 Output Parameter: 164 165 Level: intermediate 166 167 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 168 @*/ 169 #undef __FUNCT__ 170 #define __FUNCT__ "MatCUSPARSESetFormat" 171 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 172 { 173 PetscErrorCode ierr; 174 175 PetscFunctionBegin; 176 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 177 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 178 PetscFunctionReturn(0); 179 } 180 181 #undef __FUNCT__ 182 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 183 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 184 { 185 PetscErrorCode ierr; 186 MatCUSPARSEStorageFormat format; 187 PetscBool flg; 188 189 PetscFunctionBegin; 190 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 191 ierr = PetscObjectOptionsBegin((PetscObject)A); 192 if (A->factortype==MAT_FACTOR_NONE) { 193 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 194 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 195 if (flg) { 196 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 197 } 198 } else { 199 ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve", 200 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 201 if (flg) { 202 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr); 203 } 204 } 205 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 206 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 207 if (flg) { 208 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 209 } 210 ierr = PetscOptionsEnd();CHKERRQ(ierr); 211 PetscFunctionReturn(0); 212 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 217 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 218 { 219 PetscErrorCode ierr; 220 221 PetscFunctionBegin; 222 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 223 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 224 PetscFunctionReturn(0); 225 } 226 227 #undef __FUNCT__ 228 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 229 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 230 { 231 PetscErrorCode ierr; 232 233 PetscFunctionBegin; 234 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 235 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 236 PetscFunctionReturn(0); 237 } 238 239 #undef __FUNCT__ 240 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 241 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 242 { 243 PetscErrorCode ierr; 244 245 PetscFunctionBegin; 246 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 247 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 248 PetscFunctionReturn(0); 249 } 250 251 #undef __FUNCT__ 252 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 253 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 254 { 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 259 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 260 PetscFunctionReturn(0); 261 } 262 263 #undef __FUNCT__ 264 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 265 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 266 { 267 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 268 PetscInt n = A->rmap->n; 269 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 270 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 271 cusparseStatus_t stat; 272 const PetscInt *ai = a->i,*aj = a->j,*vi; 273 const MatScalar *aa = a->a,*v; 274 PetscInt *AiLo, *AjLo; 275 PetscScalar *AALo; 276 PetscInt i,nz, nzLower, offset, rowOffset; 277 PetscErrorCode ierr; 278 279 PetscFunctionBegin; 280 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 281 try { 282 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 283 nzLower=n+ai[n]-ai[1]; 284 285 /* Allocate Space for the lower triangular matrix */ 286 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 287 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 288 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 289 290 /* Fill the lower triangular matrix */ 291 AiLo[0] = (PetscInt) 0; 292 AiLo[n] = nzLower; 293 AjLo[0] = (PetscInt) 0; 294 AALo[0] = (MatScalar) 1.0; 295 v = aa; 296 vi = aj; 297 offset = 1; 298 rowOffset= 1; 299 for (i=1; i<n; i++) { 300 nz = ai[i+1] - ai[i]; 301 /* additional 1 for the term on the diagonal */ 302 AiLo[i] = rowOffset; 303 rowOffset += nz+1; 304 305 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 306 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 307 308 offset += nz; 309 AjLo[offset] = (PetscInt) i; 310 AALo[offset] = (MatScalar) 1.0; 311 offset += 1; 312 313 v += nz; 314 vi += nz; 315 } 316 317 /* allocate space for the triangular factor information */ 318 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 319 320 /* Create the matrix description */ 321 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 322 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 323 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 324 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 325 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 326 327 /* Create the solve analysis information */ 328 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 329 330 /* set the operation */ 331 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 332 333 /* set the matrix */ 334 loTriFactor->csrMat = new CsrMatrix; 335 loTriFactor->csrMat->num_rows = n; 336 loTriFactor->csrMat->num_cols = n; 337 loTriFactor->csrMat->num_entries = nzLower; 338 339 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 340 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 341 342 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 343 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 344 345 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 346 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 347 348 /* perform the solve analysis */ 349 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 350 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 351 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 352 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 353 354 /* assign the pointer. Is this really necessary? */ 355 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 356 357 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 358 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 359 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 360 } catch(char *ex) { 361 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 362 } 363 } 364 PetscFunctionReturn(0); 365 } 366 367 #undef __FUNCT__ 368 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 369 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 370 { 371 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 372 PetscInt n = A->rmap->n; 373 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 374 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 375 cusparseStatus_t stat; 376 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 377 const MatScalar *aa = a->a,*v; 378 PetscInt *AiUp, *AjUp; 379 PetscScalar *AAUp; 380 PetscInt i,nz, nzUpper, offset; 381 PetscErrorCode ierr; 382 383 PetscFunctionBegin; 384 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 385 try { 386 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 387 nzUpper = adiag[0]-adiag[n]; 388 389 /* Allocate Space for the upper triangular matrix */ 390 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 391 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 392 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 393 394 /* Fill the upper triangular matrix */ 395 AiUp[0]=(PetscInt) 0; 396 AiUp[n]=nzUpper; 397 offset = nzUpper; 398 for (i=n-1; i>=0; i--) { 399 v = aa + adiag[i+1] + 1; 400 vi = aj + adiag[i+1] + 1; 401 402 /* number of elements NOT on the diagonal */ 403 nz = adiag[i] - adiag[i+1]-1; 404 405 /* decrement the offset */ 406 offset -= (nz+1); 407 408 /* first, set the diagonal elements */ 409 AjUp[offset] = (PetscInt) i; 410 AAUp[offset] = 1./v[nz]; 411 AiUp[i] = AiUp[i+1] - (nz+1); 412 413 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 414 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 415 } 416 417 /* allocate space for the triangular factor information */ 418 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 419 420 /* Create the matrix description */ 421 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 422 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 423 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 424 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 425 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 426 427 /* Create the solve analysis information */ 428 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 429 430 /* set the operation */ 431 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 432 433 /* set the matrix */ 434 upTriFactor->csrMat = new CsrMatrix; 435 upTriFactor->csrMat->num_rows = n; 436 upTriFactor->csrMat->num_cols = n; 437 upTriFactor->csrMat->num_entries = nzUpper; 438 439 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 440 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 441 442 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 443 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 444 445 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 446 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 447 448 /* perform the solve analysis */ 449 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 450 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 451 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 452 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 453 454 /* assign the pointer. Is this really necessary? */ 455 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 456 457 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 458 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 459 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 460 } catch(char *ex) { 461 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 462 } 463 } 464 PetscFunctionReturn(0); 465 } 466 467 #undef __FUNCT__ 468 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 469 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 470 { 471 PetscErrorCode ierr; 472 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 473 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 474 IS isrow = a->row,iscol = a->icol; 475 PetscBool row_identity,col_identity; 476 const PetscInt *r,*c; 477 PetscInt n = A->rmap->n; 478 479 PetscFunctionBegin; 480 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 481 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 482 483 cusparseTriFactors->workVector = new THRUSTARRAY; 484 cusparseTriFactors->workVector->resize(n); 485 cusparseTriFactors->nnz=a->nz; 486 487 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 488 /*lower triangular indices */ 489 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 490 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 491 if (!row_identity) { 492 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 493 cusparseTriFactors->rpermIndices->assign(r, r+n); 494 } 495 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 496 497 /*upper triangular indices */ 498 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 499 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 500 if (!col_identity) { 501 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 502 cusparseTriFactors->cpermIndices->assign(c, c+n); 503 } 504 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 505 PetscFunctionReturn(0); 506 } 507 508 #undef __FUNCT__ 509 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 510 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 511 { 512 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 513 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 514 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 515 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 516 cusparseStatus_t stat; 517 PetscErrorCode ierr; 518 PetscInt *AiUp, *AjUp; 519 PetscScalar *AAUp; 520 PetscScalar *AALo; 521 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 522 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 523 const PetscInt *ai = b->i,*aj = b->j,*vj; 524 const MatScalar *aa = b->a,*v; 525 526 PetscFunctionBegin; 527 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 528 try { 529 /* Allocate Space for the upper triangular matrix */ 530 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 531 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 532 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 533 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 534 535 /* Fill the upper triangular matrix */ 536 AiUp[0]=(PetscInt) 0; 537 AiUp[n]=nzUpper; 538 offset = 0; 539 for (i=0; i<n; i++) { 540 /* set the pointers */ 541 v = aa + ai[i]; 542 vj = aj + ai[i]; 543 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 544 545 /* first, set the diagonal elements */ 546 AjUp[offset] = (PetscInt) i; 547 AAUp[offset] = 1.0/v[nz]; 548 AiUp[i] = offset; 549 AALo[offset] = 1.0/v[nz]; 550 551 offset+=1; 552 if (nz>0) { 553 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 554 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 555 for (j=offset; j<offset+nz; j++) { 556 AAUp[j] = -AAUp[j]; 557 AALo[j] = AAUp[j]/v[nz]; 558 } 559 offset+=nz; 560 } 561 } 562 563 /* allocate space for the triangular factor information */ 564 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 565 566 /* Create the matrix description */ 567 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 568 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 569 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 570 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 571 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 572 573 /* Create the solve analysis information */ 574 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 575 576 /* set the operation */ 577 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 578 579 /* set the matrix */ 580 upTriFactor->csrMat = new CsrMatrix; 581 upTriFactor->csrMat->num_rows = A->rmap->n; 582 upTriFactor->csrMat->num_cols = A->cmap->n; 583 upTriFactor->csrMat->num_entries = a->nz; 584 585 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 586 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 587 588 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 589 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 590 591 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 592 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 593 594 /* perform the solve analysis */ 595 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 596 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 597 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 598 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 599 600 /* assign the pointer. Is this really necessary? */ 601 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 602 603 /* allocate space for the triangular factor information */ 604 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 605 606 /* Create the matrix description */ 607 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 608 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 609 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 610 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 611 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 612 613 /* Create the solve analysis information */ 614 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 615 616 /* set the operation */ 617 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 618 619 /* set the matrix */ 620 loTriFactor->csrMat = new CsrMatrix; 621 loTriFactor->csrMat->num_rows = A->rmap->n; 622 loTriFactor->csrMat->num_cols = A->cmap->n; 623 loTriFactor->csrMat->num_entries = a->nz; 624 625 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 626 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 627 628 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 629 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 630 631 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 632 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 633 634 /* perform the solve analysis */ 635 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 636 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 637 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 638 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 639 640 /* assign the pointer. Is this really necessary? */ 641 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 642 643 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 644 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 645 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 646 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 647 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 648 } catch(char *ex) { 649 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 650 } 651 } 652 PetscFunctionReturn(0); 653 } 654 655 #undef __FUNCT__ 656 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 657 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 658 { 659 PetscErrorCode ierr; 660 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 661 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 662 IS ip = a->row; 663 const PetscInt *rip; 664 PetscBool perm_identity; 665 PetscInt n = A->rmap->n; 666 667 PetscFunctionBegin; 668 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 669 cusparseTriFactors->workVector = new THRUSTARRAY; 670 cusparseTriFactors->workVector->resize(n); 671 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 672 673 /*lower triangular indices */ 674 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 675 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 676 if (!perm_identity) { 677 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 678 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 679 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 680 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 681 } 682 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 683 PetscFunctionReturn(0); 684 } 685 686 #undef __FUNCT__ 687 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 688 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 689 { 690 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 691 IS isrow = b->row,iscol = b->col; 692 PetscBool row_identity,col_identity; 693 PetscErrorCode ierr; 694 695 PetscFunctionBegin; 696 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 697 /* determine which version of MatSolve needs to be used. */ 698 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 699 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 700 if (row_identity && col_identity) { 701 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 702 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 703 } else { 704 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 705 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 706 } 707 708 /* get the triangular factors */ 709 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 710 PetscFunctionReturn(0); 711 } 712 713 #undef __FUNCT__ 714 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 715 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 716 { 717 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 718 IS ip = b->row; 719 PetscBool perm_identity; 720 PetscErrorCode ierr; 721 722 PetscFunctionBegin; 723 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 724 725 /* determine which version of MatSolve needs to be used. */ 726 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 727 if (perm_identity) { 728 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 729 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 730 } else { 731 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 732 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 733 } 734 735 /* get the triangular factors */ 736 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 737 PetscFunctionReturn(0); 738 } 739 740 #undef __FUNCT__ 741 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 742 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 743 { 744 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 745 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 746 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 747 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 748 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 749 cusparseStatus_t stat; 750 cusparseIndexBase_t indexBase; 751 cusparseMatrixType_t matrixType; 752 cusparseFillMode_t fillMode; 753 cusparseDiagType_t diagType; 754 755 PetscFunctionBegin; 756 757 /*********************************************/ 758 /* Now the Transpose of the Lower Tri Factor */ 759 /*********************************************/ 760 761 /* allocate space for the transpose of the lower triangular factor */ 762 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 763 764 /* set the matrix descriptors of the lower triangular factor */ 765 matrixType = cusparseGetMatType(loTriFactor->descr); 766 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 767 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 768 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 769 diagType = cusparseGetMatDiagType(loTriFactor->descr); 770 771 /* Create the matrix description */ 772 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 773 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 774 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 775 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 776 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 777 778 /* Create the solve analysis information */ 779 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 780 781 /* set the operation */ 782 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 783 784 /* allocate GPU space for the CSC of the lower triangular factor*/ 785 loTriFactorT->csrMat = new CsrMatrix; 786 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 787 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 788 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 789 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 790 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 791 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 792 793 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 794 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 795 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 796 loTriFactor->csrMat->values->data().get(), 797 loTriFactor->csrMat->row_offsets->data().get(), 798 loTriFactor->csrMat->column_indices->data().get(), 799 loTriFactorT->csrMat->values->data().get(), 800 loTriFactorT->csrMat->column_indices->data().get(), 801 loTriFactorT->csrMat->row_offsets->data().get(), 802 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 803 804 /* perform the solve analysis on the transposed matrix */ 805 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 806 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 807 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 808 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 809 loTriFactorT->solveInfo);CHKERRCUSP(stat); 810 811 /* assign the pointer. Is this really necessary? */ 812 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 813 814 /*********************************************/ 815 /* Now the Transpose of the Upper Tri Factor */ 816 /*********************************************/ 817 818 /* allocate space for the transpose of the upper triangular factor */ 819 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 820 821 /* set the matrix descriptors of the upper triangular factor */ 822 matrixType = cusparseGetMatType(upTriFactor->descr); 823 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 824 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 825 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 826 diagType = cusparseGetMatDiagType(upTriFactor->descr); 827 828 /* Create the matrix description */ 829 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 830 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 831 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 832 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 833 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 834 835 /* Create the solve analysis information */ 836 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 837 838 /* set the operation */ 839 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 840 841 /* allocate GPU space for the CSC of the upper triangular factor*/ 842 upTriFactorT->csrMat = new CsrMatrix; 843 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 844 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 845 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 846 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 847 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 848 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 849 850 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 851 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 852 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 853 upTriFactor->csrMat->values->data().get(), 854 upTriFactor->csrMat->row_offsets->data().get(), 855 upTriFactor->csrMat->column_indices->data().get(), 856 upTriFactorT->csrMat->values->data().get(), 857 upTriFactorT->csrMat->column_indices->data().get(), 858 upTriFactorT->csrMat->row_offsets->data().get(), 859 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 860 861 /* perform the solve analysis on the transposed matrix */ 862 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 863 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 864 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 865 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 866 upTriFactorT->solveInfo);CHKERRCUSP(stat); 867 868 /* assign the pointer. Is this really necessary? */ 869 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 870 PetscFunctionReturn(0); 871 } 872 873 #undef __FUNCT__ 874 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 875 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 876 { 877 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 878 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 879 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 880 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 881 cusparseStatus_t stat; 882 cusparseIndexBase_t indexBase; 883 cudaError_t err; 884 885 PetscFunctionBegin; 886 887 /* allocate space for the triangular factor information */ 888 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 889 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 890 indexBase = cusparseGetMatIndexBase(matstruct->descr); 891 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 892 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 893 894 /* set alpha and beta */ 895 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 896 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 897 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 898 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 899 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 900 901 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 902 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 903 CsrMatrix *matrixT= new CsrMatrix; 904 matrixT->num_rows = A->rmap->n; 905 matrixT->num_cols = A->cmap->n; 906 matrixT->num_entries = a->nz; 907 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 908 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 909 matrixT->values = new THRUSTARRAY(a->nz); 910 911 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 912 indexBase = cusparseGetMatIndexBase(matstruct->descr); 913 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 914 matrix->num_cols, matrix->num_entries, 915 matrix->values->data().get(), 916 matrix->row_offsets->data().get(), 917 matrix->column_indices->data().get(), 918 matrixT->values->data().get(), 919 matrixT->column_indices->data().get(), 920 matrixT->row_offsets->data().get(), 921 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 922 923 /* assign the pointer */ 924 matstructT->mat = matrixT; 925 926 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 927 928 /* First convert HYB to CSR */ 929 CsrMatrix *temp= new CsrMatrix; 930 temp->num_rows = A->rmap->n; 931 temp->num_cols = A->cmap->n; 932 temp->num_entries = a->nz; 933 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 934 temp->column_indices = new THRUSTINTARRAY32(a->nz); 935 temp->values = new THRUSTARRAY(a->nz); 936 937 stat = cusparse_hyb2csr(cusparsestruct->handle, 938 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 939 temp->values->data().get(), 940 temp->row_offsets->data().get(), 941 temp->column_indices->data().get());CHKERRCUSP(stat); 942 943 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 944 CsrMatrix *tempT= new CsrMatrix; 945 tempT->num_rows = A->rmap->n; 946 tempT->num_cols = A->cmap->n; 947 tempT->num_entries = a->nz; 948 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 949 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 950 tempT->values = new THRUSTARRAY(a->nz); 951 952 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 953 temp->num_cols, temp->num_entries, 954 temp->values->data().get(), 955 temp->row_offsets->data().get(), 956 temp->column_indices->data().get(), 957 tempT->values->data().get(), 958 tempT->column_indices->data().get(), 959 tempT->row_offsets->data().get(), 960 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 961 962 /* Last, convert CSC to HYB */ 963 cusparseHybMat_t hybMat; 964 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 965 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 966 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 967 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 968 matstructT->descr, tempT->values->data().get(), 969 tempT->row_offsets->data().get(), 970 tempT->column_indices->data().get(), 971 hybMat, 0, partition);CHKERRCUSP(stat); 972 973 /* assign the pointer */ 974 matstructT->mat = hybMat; 975 976 /* delete temporaries */ 977 if (tempT) { 978 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 979 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 980 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 981 delete (CsrMatrix*) tempT; 982 } 983 if (temp) { 984 if (temp->values) delete (THRUSTARRAY*) temp->values; 985 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 986 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 987 delete (CsrMatrix*) temp; 988 } 989 } 990 /* assign the compressed row indices */ 991 matstructT->cprowIndices = new THRUSTINTARRAY; 992 993 /* assign the pointer */ 994 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 995 PetscFunctionReturn(0); 996 } 997 998 #undef __FUNCT__ 999 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 1000 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1001 { 1002 CUSPARRAY *xGPU, *bGPU; 1003 cusparseStatus_t stat; 1004 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1005 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1006 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1007 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1008 PetscErrorCode ierr; 1009 1010 PetscFunctionBegin; 1011 /* Analyze the matrix and create the transpose ... on the fly */ 1012 if (!loTriFactorT && !upTriFactorT) { 1013 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1014 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1015 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1016 } 1017 1018 /* Get the GPU pointers */ 1019 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1020 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1021 1022 /* First, reorder with the row permutation */ 1023 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1024 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1025 xGPU->begin()); 1026 1027 /* First, solve U */ 1028 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1029 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1030 upTriFactorT->csrMat->values->data().get(), 1031 upTriFactorT->csrMat->row_offsets->data().get(), 1032 upTriFactorT->csrMat->column_indices->data().get(), 1033 upTriFactorT->solveInfo, 1034 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1035 1036 /* Then, solve L */ 1037 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1038 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1039 loTriFactorT->csrMat->values->data().get(), 1040 loTriFactorT->csrMat->row_offsets->data().get(), 1041 loTriFactorT->csrMat->column_indices->data().get(), 1042 loTriFactorT->solveInfo, 1043 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1044 1045 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1046 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1047 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1048 tempGPU->begin()); 1049 1050 /* Copy the temporary to the full solution. */ 1051 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1052 1053 /* restore */ 1054 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1055 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1056 ierr = WaitForGPU();CHKERRCUSP(ierr); 1057 1058 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1059 PetscFunctionReturn(0); 1060 } 1061 1062 #undef __FUNCT__ 1063 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 1064 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1065 { 1066 CUSPARRAY *xGPU,*bGPU; 1067 cusparseStatus_t stat; 1068 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1069 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1070 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1071 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1072 PetscErrorCode ierr; 1073 1074 PetscFunctionBegin; 1075 /* Analyze the matrix and create the transpose ... on the fly */ 1076 if (!loTriFactorT && !upTriFactorT) { 1077 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1078 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1079 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1080 } 1081 1082 /* Get the GPU pointers */ 1083 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1084 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1085 1086 /* First, solve U */ 1087 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1088 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1089 upTriFactorT->csrMat->values->data().get(), 1090 upTriFactorT->csrMat->row_offsets->data().get(), 1091 upTriFactorT->csrMat->column_indices->data().get(), 1092 upTriFactorT->solveInfo, 1093 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1094 1095 /* Then, solve L */ 1096 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1097 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1098 loTriFactorT->csrMat->values->data().get(), 1099 loTriFactorT->csrMat->row_offsets->data().get(), 1100 loTriFactorT->csrMat->column_indices->data().get(), 1101 loTriFactorT->solveInfo, 1102 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1103 1104 /* restore */ 1105 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1106 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1107 ierr = WaitForGPU();CHKERRCUSP(ierr); 1108 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1109 PetscFunctionReturn(0); 1110 } 1111 1112 #undef __FUNCT__ 1113 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 1114 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1115 { 1116 CUSPARRAY *xGPU,*bGPU; 1117 cusparseStatus_t stat; 1118 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1119 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1120 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1121 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1122 PetscErrorCode ierr; 1123 1124 PetscFunctionBegin; 1125 /* Get the GPU pointers */ 1126 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1127 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1128 1129 /* First, reorder with the row permutation */ 1130 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1131 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1132 xGPU->begin()); 1133 1134 /* Next, solve L */ 1135 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1136 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1137 loTriFactor->csrMat->values->data().get(), 1138 loTriFactor->csrMat->row_offsets->data().get(), 1139 loTriFactor->csrMat->column_indices->data().get(), 1140 loTriFactor->solveInfo, 1141 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1142 1143 /* Then, solve U */ 1144 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1145 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1146 upTriFactor->csrMat->values->data().get(), 1147 upTriFactor->csrMat->row_offsets->data().get(), 1148 upTriFactor->csrMat->column_indices->data().get(), 1149 upTriFactor->solveInfo, 1150 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1151 1152 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1153 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1154 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1155 tempGPU->begin()); 1156 1157 /* Copy the temporary to the full solution. */ 1158 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1159 1160 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1161 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1162 ierr = WaitForGPU();CHKERRCUSP(ierr); 1163 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1164 PetscFunctionReturn(0); 1165 } 1166 1167 #undef __FUNCT__ 1168 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 1169 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1170 { 1171 CUSPARRAY *xGPU,*bGPU; 1172 cusparseStatus_t stat; 1173 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1174 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1175 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1176 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1177 PetscErrorCode ierr; 1178 1179 PetscFunctionBegin; 1180 /* Get the GPU pointers */ 1181 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1182 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1183 1184 /* First, solve L */ 1185 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1186 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1187 loTriFactor->csrMat->values->data().get(), 1188 loTriFactor->csrMat->row_offsets->data().get(), 1189 loTriFactor->csrMat->column_indices->data().get(), 1190 loTriFactor->solveInfo, 1191 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1192 1193 /* Next, solve U */ 1194 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1195 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1196 upTriFactor->csrMat->values->data().get(), 1197 upTriFactor->csrMat->row_offsets->data().get(), 1198 upTriFactor->csrMat->column_indices->data().get(), 1199 upTriFactor->solveInfo, 1200 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1201 1202 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1203 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1204 ierr = WaitForGPU();CHKERRCUSP(ierr); 1205 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1206 PetscFunctionReturn(0); 1207 } 1208 1209 #undef __FUNCT__ 1210 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 1211 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1212 { 1213 1214 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1215 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1216 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1217 PetscInt m = A->rmap->n,*ii,*ridx; 1218 PetscErrorCode ierr; 1219 cusparseStatus_t stat; 1220 cudaError_t err; 1221 1222 PetscFunctionBegin; 1223 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 1224 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1225 if (matstruct) { 1226 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized."); 1227 } 1228 try { 1229 cusparsestruct->nonzerorow=0; 1230 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1231 1232 if (a->compressedrow.use) { 1233 m = a->compressedrow.nrows; 1234 ii = a->compressedrow.i; 1235 ridx = a->compressedrow.rindex; 1236 } else { 1237 /* Forcing compressed row on the GPU */ 1238 int k=0; 1239 ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 1240 ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 1241 ii[0]=0; 1242 for (int j = 0; j<m; j++) { 1243 if ((a->i[j+1]-a->i[j])>0) { 1244 ii[k] = a->i[j]; 1245 ridx[k]= j; 1246 k++; 1247 } 1248 } 1249 ii[cusparsestruct->nonzerorow] = a->nz; 1250 m = cusparsestruct->nonzerorow; 1251 } 1252 1253 /* allocate space for the triangular factor information */ 1254 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1255 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat); 1256 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 1257 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 1258 1259 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 1260 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1261 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err); 1262 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1263 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 1264 1265 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1266 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1267 /* set the matrix */ 1268 CsrMatrix *matrix= new CsrMatrix; 1269 matrix->num_rows = A->rmap->n; 1270 matrix->num_cols = A->cmap->n; 1271 matrix->num_entries = a->nz; 1272 matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1273 matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1); 1274 1275 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1276 matrix->column_indices->assign(a->j, a->j+a->nz); 1277 1278 matrix->values = new THRUSTARRAY(a->nz); 1279 matrix->values->assign(a->a, a->a+a->nz); 1280 1281 /* assign the pointer */ 1282 matstruct->mat = matrix; 1283 1284 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1285 1286 CsrMatrix *matrix= new CsrMatrix; 1287 matrix->num_rows = A->rmap->n; 1288 matrix->num_cols = A->cmap->n; 1289 matrix->num_entries = a->nz; 1290 matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291 matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1); 1292 1293 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1294 matrix->column_indices->assign(a->j, a->j+a->nz); 1295 1296 matrix->values = new THRUSTARRAY(a->nz); 1297 matrix->values->assign(a->a, a->a+a->nz); 1298 1299 cusparseHybMat_t hybMat; 1300 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 1301 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1302 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1303 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1304 matstruct->descr, matrix->values->data().get(), 1305 matrix->row_offsets->data().get(), 1306 matrix->column_indices->data().get(), 1307 hybMat, 0, partition);CHKERRCUSP(stat); 1308 /* assign the pointer */ 1309 matstruct->mat = hybMat; 1310 1311 if (matrix) { 1312 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1313 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1314 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1315 delete (CsrMatrix*)matrix; 1316 } 1317 } 1318 1319 /* assign the compressed row indices */ 1320 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1321 matstruct->cprowIndices->assign(ridx,ridx+m); 1322 1323 /* assign the pointer */ 1324 cusparsestruct->mat = matstruct; 1325 1326 if (!a->compressedrow.use) { 1327 ierr = PetscFree(ii);CHKERRQ(ierr); 1328 ierr = PetscFree(ridx);CHKERRQ(ierr); 1329 } 1330 cusparsestruct->workVector = new THRUSTARRAY; 1331 cusparsestruct->workVector->resize(m); 1332 } catch(char *ex) { 1333 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1334 } 1335 ierr = WaitForGPU();CHKERRCUSP(ierr); 1336 1337 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 1338 1339 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1340 } 1341 PetscFunctionReturn(0); 1342 } 1343 1344 #undef __FUNCT__ 1345 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 1346 static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 1347 { 1348 PetscErrorCode ierr; 1349 1350 PetscFunctionBegin; 1351 if (right) { 1352 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1353 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1354 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 1355 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 1356 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1357 } 1358 if (left) { 1359 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1360 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1361 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 1362 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 1363 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1364 } 1365 PetscFunctionReturn(0); 1366 } 1367 1368 struct VecCUSPPlusEquals 1369 { 1370 template <typename Tuple> 1371 __host__ __device__ 1372 void operator()(Tuple t) 1373 { 1374 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1375 } 1376 }; 1377 1378 #undef __FUNCT__ 1379 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 1380 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1381 { 1382 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1383 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1384 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1385 CUSPARRAY *xarray,*yarray; 1386 PetscErrorCode ierr; 1387 cusparseStatus_t stat; 1388 1389 PetscFunctionBegin; 1390 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1391 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1392 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1393 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1394 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1395 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1396 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1397 mat->num_rows, mat->num_cols, mat->num_entries, 1398 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1399 mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1400 yarray->data().get());CHKERRCUSP(stat); 1401 } else { 1402 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1403 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1404 matstruct->alpha, matstruct->descr, hybMat, 1405 xarray->data().get(), matstruct->beta, 1406 yarray->data().get());CHKERRCUSP(stat); 1407 } 1408 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1409 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1410 if (!cusparsestruct->stream) { 1411 ierr = WaitForGPU();CHKERRCUSP(ierr); 1412 } 1413 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1414 PetscFunctionReturn(0); 1415 } 1416 1417 #undef __FUNCT__ 1418 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 1419 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1420 { 1421 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1422 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1423 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1424 CUSPARRAY *xarray,*yarray; 1425 PetscErrorCode ierr; 1426 cusparseStatus_t stat; 1427 1428 PetscFunctionBegin; 1429 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1430 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1431 if (!matstructT) { 1432 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1433 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1434 } 1435 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1436 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1437 1438 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1439 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1440 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1441 mat->num_rows, mat->num_cols, 1442 mat->num_entries, matstructT->alpha, matstructT->descr, 1443 mat->values->data().get(), mat->row_offsets->data().get(), 1444 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1445 yarray->data().get());CHKERRCUSP(stat); 1446 } else { 1447 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1448 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1449 matstructT->alpha, matstructT->descr, hybMat, 1450 xarray->data().get(), matstructT->beta, 1451 yarray->data().get());CHKERRCUSP(stat); 1452 } 1453 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1454 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1455 if (!cusparsestruct->stream) { 1456 ierr = WaitForGPU();CHKERRCUSP(ierr); 1457 } 1458 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1459 PetscFunctionReturn(0); 1460 } 1461 1462 1463 #undef __FUNCT__ 1464 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 1465 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1466 { 1467 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1468 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1469 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1470 CUSPARRAY *xarray,*yarray,*zarray; 1471 PetscErrorCode ierr; 1472 cusparseStatus_t stat; 1473 1474 PetscFunctionBegin; 1475 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1476 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1477 try { 1478 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1479 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1480 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1481 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1482 1483 /* multiply add */ 1484 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1485 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1486 /* here we need to be careful to set the number of rows in the multiply to the 1487 number of compressed rows in the matrix ... which is equivalent to the 1488 size of the workVector */ 1489 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1490 cusparsestruct->workVector->size(), mat->num_cols, 1491 mat->num_entries, matstruct->alpha, matstruct->descr, 1492 mat->values->data().get(), mat->row_offsets->data().get(), 1493 mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1494 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1495 } else { 1496 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1497 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1498 matstruct->alpha, matstruct->descr, hybMat, 1499 xarray->data().get(), matstruct->beta, 1500 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1501 } 1502 1503 /* scatter the data from the temporary into the full vector with a += operation */ 1504 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))), 1505 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1506 VecCUSPPlusEquals()); 1507 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1508 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1509 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1510 1511 } catch(char *ex) { 1512 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1513 } 1514 ierr = WaitForGPU();CHKERRCUSP(ierr); 1515 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1516 PetscFunctionReturn(0); 1517 } 1518 1519 #undef __FUNCT__ 1520 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1521 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1522 { 1523 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1524 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1525 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1526 CUSPARRAY *xarray,*yarray,*zarray; 1527 PetscErrorCode ierr; 1528 cusparseStatus_t stat; 1529 1530 PetscFunctionBegin; 1531 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1532 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1533 if (!matstructT) { 1534 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1535 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1536 } 1537 1538 try { 1539 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1540 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1541 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1542 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1543 1544 /* multiply add with matrix transpose */ 1545 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1546 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1547 /* here we need to be careful to set the number of rows in the multiply to the 1548 number of compressed rows in the matrix ... which is equivalent to the 1549 size of the workVector */ 1550 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1551 cusparsestruct->workVector->size(), mat->num_cols, 1552 mat->num_entries, matstructT->alpha, matstructT->descr, 1553 mat->values->data().get(), mat->row_offsets->data().get(), 1554 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1555 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1556 } else { 1557 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1558 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1559 matstructT->alpha, matstructT->descr, hybMat, 1560 xarray->data().get(), matstructT->beta, 1561 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1562 } 1563 1564 /* scatter the data from the temporary into the full vector with a += operation */ 1565 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1566 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1567 VecCUSPPlusEquals()); 1568 1569 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1570 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1571 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1572 1573 } catch(char *ex) { 1574 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1575 } 1576 ierr = WaitForGPU();CHKERRCUSP(ierr); 1577 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1578 PetscFunctionReturn(0); 1579 } 1580 1581 #undef __FUNCT__ 1582 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1583 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1584 { 1585 PetscErrorCode ierr; 1586 1587 PetscFunctionBegin; 1588 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1589 if (A->factortype==MAT_FACTOR_NONE) { 1590 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1591 } 1592 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1593 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1594 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1595 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1596 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1597 PetscFunctionReturn(0); 1598 } 1599 1600 /* --------------------------------------------------------------------------------*/ 1601 #undef __FUNCT__ 1602 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1603 /*@ 1604 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1605 (the default parallel PETSc format). This matrix will ultimately pushed down 1606 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1607 assembly performance the user should preallocate the matrix storage by setting 1608 the parameter nz (or the array nnz). By setting these parameters accurately, 1609 performance during matrix assembly can be increased by more than a factor of 50. 1610 1611 Collective on MPI_Comm 1612 1613 Input Parameters: 1614 + comm - MPI communicator, set to PETSC_COMM_SELF 1615 . m - number of rows 1616 . n - number of columns 1617 . nz - number of nonzeros per row (same for all rows) 1618 - nnz - array containing the number of nonzeros in the various rows 1619 (possibly different for each row) or NULL 1620 1621 Output Parameter: 1622 . A - the matrix 1623 1624 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1625 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1626 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1627 1628 Notes: 1629 If nnz is given then nz is ignored 1630 1631 The AIJ format (also called the Yale sparse matrix format or 1632 compressed row storage), is fully compatible with standard Fortran 77 1633 storage. That is, the stored row and column indices can begin at 1634 either one (as in Fortran) or zero. See the users' manual for details. 1635 1636 Specify the preallocated storage with either nz or nnz (not both). 1637 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1638 allocation. For large problems you MUST preallocate memory or you 1639 will get TERRIBLE performance, see the users' manual chapter on matrices. 1640 1641 By default, this format uses inodes (identical nodes) when possible, to 1642 improve numerical efficiency of matrix-vector products and solves. We 1643 search for consecutive rows with the same nonzero structure, thereby 1644 reusing matrix information to achieve increased efficiency. 1645 1646 Level: intermediate 1647 1648 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1649 @*/ 1650 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1651 { 1652 PetscErrorCode ierr; 1653 1654 PetscFunctionBegin; 1655 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1656 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1657 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1658 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1659 PetscFunctionReturn(0); 1660 } 1661 1662 #undef __FUNCT__ 1663 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1664 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1665 { 1666 PetscErrorCode ierr; 1667 cusparseStatus_t stat; 1668 cudaError_t err; 1669 PetscFunctionBegin; 1670 if (A->factortype==MAT_FACTOR_NONE) { 1671 try { 1672 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1673 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1674 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1675 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1676 1677 /* delete all the members in each of the data structures */ 1678 if (matstruct) { 1679 if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); } 1680 if (matstruct->mat) { 1681 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1682 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat; 1683 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1684 } else { 1685 CsrMatrix* mat = (CsrMatrix*)matstruct->mat; 1686 if (mat->values) delete (THRUSTARRAY*)mat->values; 1687 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1688 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1689 delete (CsrMatrix*)matstruct->mat; 1690 } 1691 } 1692 if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); } 1693 if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); } 1694 if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices; 1695 if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct; 1696 } 1697 if (matstructT) { 1698 if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); } 1699 if (matstructT->mat) { 1700 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1701 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat; 1702 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1703 } else { 1704 CsrMatrix* matT = (CsrMatrix*)matstructT->mat; 1705 if (matT->values) delete (THRUSTARRAY*)matT->values; 1706 if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices; 1707 if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets; 1708 delete (CsrMatrix*)matstructT->mat; 1709 } 1710 } 1711 if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); } 1712 if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); } 1713 if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices; 1714 if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT; 1715 } 1716 if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector; 1717 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1718 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1719 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1720 } else { 1721 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1722 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1723 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1724 } 1725 } catch(char *ex) { 1726 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1727 } 1728 } else { 1729 /* The triangular factors */ 1730 try { 1731 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1732 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1733 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1734 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1735 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1736 1737 /* delete all the members in each of the data structures */ 1738 if (loTriFactor) { 1739 if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); } 1740 if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); } 1741 if (loTriFactor->csrMat) { 1742 if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values; 1743 if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices; 1744 if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets; 1745 delete (CsrMatrix*)loTriFactor->csrMat; 1746 } 1747 if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor; 1748 } 1749 1750 if (upTriFactor) { 1751 if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); } 1752 if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); } 1753 if (upTriFactor->csrMat) { 1754 if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values; 1755 if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices; 1756 if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets; 1757 delete (CsrMatrix*)upTriFactor->csrMat; 1758 } 1759 if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor; 1760 } 1761 1762 if (loTriFactorT) { 1763 if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); } 1764 if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); } 1765 if (loTriFactorT->csrMat) { 1766 if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values; 1767 if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices; 1768 if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets; 1769 delete (CsrMatrix*)loTriFactorT->csrMat; 1770 } 1771 if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT; 1772 } 1773 1774 if (upTriFactorT) { 1775 if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); } 1776 if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); } 1777 if (upTriFactorT->csrMat) { 1778 if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values; 1779 if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices; 1780 if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets; 1781 delete (CsrMatrix*)upTriFactorT->csrMat; 1782 } 1783 if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT; 1784 } 1785 if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices; 1786 if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices; 1787 if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector; 1788 if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); } 1789 if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors; 1790 } catch(char *ex) { 1791 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1792 } 1793 } 1794 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 1795 A->spptr = 0; 1796 1797 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1798 PetscFunctionReturn(0); 1799 } 1800 1801 #undef __FUNCT__ 1802 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1803 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1804 { 1805 PetscErrorCode ierr; 1806 cusparseStatus_t stat; 1807 cusparseHandle_t handle=0; 1808 1809 PetscFunctionBegin; 1810 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1811 if (B->factortype==MAT_FACTOR_NONE) { 1812 /* you cannot check the inode.use flag here since the matrix was just created. 1813 now build a GPU matrix data structure */ 1814 B->spptr = new Mat_SeqAIJCUSPARSE; 1815 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1816 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1817 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1818 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1819 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1820 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1821 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1822 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1823 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1824 } else { 1825 /* NEXT, set the pointers to the triangular factors */ 1826 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1827 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1828 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1829 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1830 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1831 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1832 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1833 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1834 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; 1835 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1836 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1837 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1838 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1839 } 1840 1841 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1842 default cusparse tri solve. Note the difference with the implementation in 1843 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1844 ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 1845 1846 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1847 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1848 B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 1849 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1850 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1851 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1852 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1853 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1854 1855 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1856 1857 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1858 1859 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1860 PetscFunctionReturn(0); 1861 } 1862 1863 /*M 1864 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1865 1866 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1867 CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 1868 the CUSPARSE library. 1869 1870 Options Database Keys: 1871 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1872 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1873 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1874 - -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1875 1876 Level: beginner 1877 1878 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1879 M*/ 1880