/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Enum-string table for MatCUSPARSEStorageFormat; the trailing three entries
   (type name, prefix, terminator) follow the PetscOptionsEnum convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, and mult kernels implemented below. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures hanging off A->spptr. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
   MatCUSPARSESetStream - records the given CUDA stream in the GPU data structure of A
   and binds the matrix's cuSPARSE handle to it, so subsequent cuSPARSE calls on this
   matrix are issued on that stream.

   A must already have its Mat_SeqAIJCUSPARSE structure allocated in A->spptr
   (i.e. be of type SEQAIJCUSPARSE); no check is performed here.

   NOTE(review): CHKERRCUDA is applied to a cusparseStatus_t here and throughout this
   file — confirm the macro accepts cuSPARSE status codes as well as cudaError_t.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - installs a caller-supplied cuSPARSE handle in A, destroying
   any handle the matrix previously owned, and switches the handle's pointer mode so
   that scalar arguments (alpha/beta) are read from device memory.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* destroy the old handle before taking ownership of the new one */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - forgets the cuSPARSE handle stored in A without destroying
   it. Presumably used when the handle is owned elsewhere (e.g. set via
   MatCUSPARSESetHandle) so that matrix destruction does not free it — TODO confirm
   against Mat_SeqAIJCUSPARSE_Destroy.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   Factory routine for the cusparse solver package: creates the (square, n x n) factor
   matrix B of type SEQAIJCUSPARSE and installs the symbolic-factorization function
   pointers appropriate to the requested factor type. Numeric-factorization pointers
   are installed later by the symbolic routines themselves.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* skip host-side array allocation; the factor arrays are set up by the symbolic phase */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Type-specific implementation behind MatCUSPARSESetFormat(): stores the requested
   GPU storage format in the Mat_SeqAIJCUSPARSE structure. For SEQAIJCUSPARSE both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same (single) format field.
   ELL/HYB formats require CUDA 4.2; on older CUDA only CSR is accepted.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* pre-4.2 CUDA: only CSR is available */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Options handler: reads -mat_cusparse_mult_storage_format (SpMV only, non-factored
   matrices) and -mat_cusparse_storage_format (SpMV and triangular solve) and applies
   them via MatCUSPARSESetFormat().

   NOTE(review): PetscOptionsHead is called before PetscObjectOptionsBegin and the
   return value of PetscObjectOptionsBegin is assigned to ierr but never checked with
   CHKERRQ — confirm this ordering/checking against the PETSc options-processing
   conventions (Begin normally precedes Head).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/*
   Symbolic ILU: delegates to the CPU SeqAIJ implementation, then redirects the numeric
   phase to the CUSPARSE version so the factors are copied/analyzed on the GPU.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU — CPU symbolic phase, GPU numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, then GPU Cholesky numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, then GPU Cholesky numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Builds the GPU copy of the unit lower-triangular factor L from the host ILU
   factorization stored in A (a SeqAIJ factor matrix): assembles a CSR copy of L
   (explicit 1's on the diagonal) in pinned host memory, uploads it into thrust
   device arrays, and runs the cuSPARSE triangular-solve analysis on it.
   Only runs when the GPU copy is unallocated or stale (CPU newer).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers for faster transfer to the device */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal;
         each later row i copies its strictly-lower entries then appends the unit diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v = aa;
      vi = aj;
      offset = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i] = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset += 1;

        v += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the host CSR arrays into device (thrust) vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds the GPU copy of the upper-triangular factor U from the host ILU
   factorization: walks rows backwards using a->diag, stores the reciprocal of the
   diagonal entry first in each row (non-unit diagonal), uploads the CSR into
   device arrays, and runs the cuSPARSE triangular-solve analysis.
   Only runs when the GPU copy is unallocated or stale (CPU newer).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first since the
         U part of the SeqAIJ factor is stored back-to-front via adiag */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: stored host value is 1/d so invert back */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the host CSR arrays into device (thrust) vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Copies both ILU triangular factors of A to the GPU, allocates the intermediate
   work vector used by the two-stage triangular solve, and uploads the row/column
   permutation indices when the orderings are not the identity.
   Marks the GPU copy current (PETSC_CUDA_BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between the L and U solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only store the permutation when it actually permutes something */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds both GPU triangular factors for an ICC/Cholesky factorization from the host
   factor stored in A. A single upper-triangular CSR (AiUp/AjUp) is assembled on the
   host with two value arrays: AAUp holds U (unit-diagonal form with 1/d stored on the
   diagonal) and AALo holds the values used for the "lower" solve, which is performed
   as a TRANSPOSE solve on the same upper-triangular pattern.

   NOTE(review): A->data is cast both to Mat_SeqAIJ* (for a->nz) and to Mat_SeqSBAIJ*
   (for b->i/b->j/b->a). This presumably relies on the ICC factor being stored in
   SBAIJ-style upper-triangular layout with compatible leading fields — confirm
   against the Cholesky/ICC factor layout for SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; host stores d at the end of the row (v[nz]) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i] = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals for U; additionally scale by 1/d for the L values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" factor reuses the upper-triangular
         pattern (UPPER fill, non-unit diagonal) and is solved with a TRANSPOSE op below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation: transpose solve of the upper pattern acts as the lower solve */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Copies the ICC/Cholesky triangular factors of A to the GPU, allocates the solve
   work vector, records the factor's nonzero count (both triangles minus the doubly
   counted diagonal), and uploads the permutation (same IS used for rows and columns,
   since the factorization is symmetric) when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: row and column permutations are the same IS */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric LU/ILU: runs the CPU numeric factorization, selects the natural-ordering
   fast path for the GPU solves when both permutations are the identity, then copies
   the factors to the GPU and performs the cuSPARSE solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric Cholesky/ICC: runs the CPU numeric factorization, selects the
   natural-ordering fast path when the permutation is the identity, then copies the
   factors to the GPU and performs the cuSPARSE solve analysis.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds the transposed (CSC) copies of both triangular factors on the GPU, via
   cusparse csr2csc, and runs the solve analysis on each — used by MatSolveTranspose.
   The fill mode of each transposed descriptor is flipped relative to its source
   factor, while index base, matrix type and diag type are carried over.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; flip the fill mode for the transpose */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; flip the fill mode for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 808 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 809 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 810 upTriFactor->csrMat->values->data().get(), 811 upTriFactor->csrMat->row_offsets->data().get(), 812 upTriFactor->csrMat->column_indices->data().get(), 813 upTriFactorT->csrMat->values->data().get(), 814 upTriFactorT->csrMat->column_indices->data().get(), 815 upTriFactorT->csrMat->row_offsets->data().get(), 816 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 817 818 /* perform the solve analysis on the transposed matrix */ 819 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 820 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 821 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 822 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 823 upTriFactorT->solveInfo);CHKERRCUDA(stat); 824 825 /* assign the pointer. Is this really necessary? 
*/ 826 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 827 PetscFunctionReturn(0); 828 } 829 830 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 831 { 832 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 833 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 834 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 835 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 836 cusparseStatus_t stat; 837 cusparseIndexBase_t indexBase; 838 cudaError_t err; 839 840 PetscFunctionBegin; 841 842 /* allocate space for the triangular factor information */ 843 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 844 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 845 indexBase = cusparseGetMatIndexBase(matstruct->descr); 846 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 847 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 848 849 /* set alpha and beta */ 850 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 851 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 852 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 853 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 854 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 855 856 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 857 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 858 CsrMatrix *matrixT= new CsrMatrix; 859 matrixT->num_rows = A->rmap->n; 860 matrixT->num_cols = A->cmap->n; 861 matrixT->num_entries = a->nz; 862 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 863 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 864 matrixT->values = new 
THRUSTARRAY(a->nz);

    /* compute the transpose: the CSC of A's CSR is the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    /* Bug fix: the transpose of an m x n matrix is n x m; the previous code kept
       rmap->n rows and sized row_offsets with rmap->n+1, under-allocating the
       cmap->n+1 offsets cusparse_csr2csc writes for rectangular matrices. */
    tempT->num_rows       = A->cmap->n;
    tempT->num_cols       = A->rmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB; note the transposed dimensions (n x m) to match tempT */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: transpose SpMV output is not compressed) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b for an LU-factored matrix with row/column permutations.
   Since A = LU implies A^T = U^T L^T, the upper factor's transpose is solved
   first, then the lower factor's transpose.  The transposed factors are built
   lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (A^T = U^T L^T, so the upper factor's transpose comes first) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b when the factorization used the natural ordering, so no
   row/column permutations are applied.  Transposed factors are built lazily.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b for an LU-factored matrix with row/column permutations:
   permute b with the row permutation, solve L then U, then apply the column
   permutation to the result.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  VecType                               t;
  PetscBool                             flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder b with the row permutation into x (the iteration range is
     fixed by the index iterators, so the same base pointer is used at both ends) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b when the factorization used the natural ordering; no
   permutations are needed, just the L then U triangular solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L */
  stat =
cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Copies the CPU (Mat_SeqAIJ) matrix data to the GPU cusparse structure.  If only
   numerical values changed (same nonzero pattern, CSR storage), just the values
   array is refreshed; otherwise the whole GPU structure is rebuilt.  Rows with no
   nonzeros are squeezed out ("compressed row" storage is forced on the GPU), so
   SpMV outputs go through cprowIndices to land in the right slots.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* same nonzero pattern as the GPU copy: refresh values only */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      matrix->values->assign(a->a, a->a+a->nz);
    } else {
      /* pattern (or format) changed: rebuild everything */
      Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix multiply information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-side alpha/beta for pointer mode DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* stage through a temporary CSR matrix, then convert to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging matrix is no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
          /* NOTE(review): with CUDA_VERSION<4020 this branch silently leaves
             matstruct->mat unset -- confirm intended for such old toolkits */
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* NOTE(review): any previously allocated workVector is not freed here on
           rebuild -- looks like a leak; confirm against the destroy path */
        cusparsestruct->workVector = new THRUSTARRAY;
        cusparsestruct->workVector->resize(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_CUDA_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Creates compatible CUDA vectors for A x = b: "right" has the column layout,
   "left" the row layout of the matrix.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr =
VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate the first tuple element into the second (y += x),
   used to scatter-add compressed SpMV results into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/*
   y = A x using cusparse SpMV in the chosen storage format (CSR or ELL/HYB).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only block when no user stream was attached to the handle */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   y = A^T x, running a non-transpose SpMV against the explicitly stored
   transpose, which is generated lazily on first use.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
   z = y + A x.  The SpMV result lands in the compressed work vector and is then
   scatter-added into z through the compressed row index map.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the 1465 size of the workVector */ 1466 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1467 mat->num_rows, mat->num_cols, 1468 mat->num_entries, matstruct->alpha, matstruct->descr, 1469 mat->values->data().get(), mat->row_offsets->data().get(), 1470 mat->column_indices->data().get(), xarray, matstruct->beta, 1471 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1472 } else { 1473 #if CUDA_VERSION>=4020 1474 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1475 if (cusparsestruct->workVector->size()) { 1476 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1477 matstruct->alpha, matstruct->descr, hybMat, 1478 xarray, matstruct->beta, 1479 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1480 } 1481 #endif 1482 } 1483 1484 /* scatter the data from the temporary into the full vector with a += operation */ 1485 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1486 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1487 VecCUDAPlusEquals()); 1488 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1489 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1490 1491 } catch(char *ex) { 1492 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1493 } 1494 ierr = WaitForGPU();CHKERRCUDA(ierr); 1495 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1496 PetscFunctionReturn(0); 1497 } 1498 1499 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1500 { 1501 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1502 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1503 Mat_SeqAIJCUSPARSEMultStruct *matstructT = 
(Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!matstructT) {
    /* transpose structure not built yet; build it now and re-read the pointer */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      /* NOTE(review): unlike the HYB branch below, this call is not guarded by
         workVector->size() — confirm empty (all-zero-row) matrices are handled upstream */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatAssemblyEnd_SeqAIJCUSPARSE - finishes host-side assembly, mirrors the
   result to the GPU (non-factor matrices only), and installs the CUSPARSE
   mult kernels on final (non-flush) assembly. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.
1614 1615 Specify the preallocated storage with either nz or nnz (not both). 1616 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1617 allocation. For large problems you MUST preallocate memory or you 1618 will get TERRIBLE performance, see the users' manual chapter on matrices. 1619 1620 By default, this format uses inodes (identical nodes) when possible, to 1621 improve numerical efficiency of matrix-vector products and solves. We 1622 search for consecutive rows with the same nonzero structure, thereby 1623 reusing matrix information to achieve increased efficiency. 1624 1625 Level: intermediate 1626 1627 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1628 @*/ 1629 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1630 { 1631 PetscErrorCode ierr; 1632 1633 PetscFunctionBegin; 1634 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1635 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1636 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1637 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1638 PetscFunctionReturn(0); 1639 } 1640 1641 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1642 { 1643 PetscErrorCode ierr; 1644 1645 PetscFunctionBegin; 1646 if (A->factortype==MAT_FACTOR_NONE) { 1647 if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) { 1648 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1649 } 1650 } else { 1651 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1652 } 1653 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1654 PetscFunctionReturn(0); 1655 } 1656 1657 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1658 { 1659 PetscErrorCode ierr; 1660 cusparseStatus_t stat; 1661 cusparseHandle_t handle=0; 1662 1663 PetscFunctionBegin; 
1664 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1665 if (B->factortype==MAT_FACTOR_NONE) { 1666 /* you cannot check the inode.use flag here since the matrix was just created. 1667 now build a GPU matrix data structure */ 1668 B->spptr = new Mat_SeqAIJCUSPARSE; 1669 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1670 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1671 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1672 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1673 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1674 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1675 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1676 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1677 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1678 } else { 1679 /* NEXT, set the pointers to the triangular factors */ 1680 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1681 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1682 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1683 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1684 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1685 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1686 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1687 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1688 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1689 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1690 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1691 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1692 } 1693 1694 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1695 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1696 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1697 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1698 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1699 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1700 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1701 
B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* MatSolverPackageRegister_CUSPARSE - registers the CUSPARSE factor routines
   (LU, Cholesky, ILU, ICC) for the seqaijcusparse matrix type. */
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* Mat_SeqAIJCUSPARSE_Destroy - frees the GPU mult structures (plain and
   transpose), the work vector, and the cusparse handle, then deletes the
   container itself and zeroes the caller's pointer.

   Fix over the original: the PetscErrorCode returned by the two
   Mat_SeqAIJCUSPARSEMultStruct_Destroy() calls was silently ignored, and the
   handle test used an assignment inside the if-condition. */
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}

/* CsrMatrix_Destroy - deletes the thrust vectors backing a CsrMatrix and the
   struct itself; zeroes the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
PetscFunctionReturn(0);
}

/* Mat_SeqAIJCUSPARSETriFactorStruct_Destroy - releases one triangular-factor
   structure: its matrix descriptor, solve-analysis info, and CSR storage. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Mat_SeqAIJCUSPARSEMultStruct_Destroy - releases a mult structure: the matrix
   storage (HYB object or CsrMatrix, depending on format), the descriptor, the
   compressed-row index vector, and the device-resident alpha/beta scalars.

   Fix over the original: the PetscErrorCode returned by CsrMatrix_Destroy()
   was silently ignored. */
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta were cudaMalloc'd; free them on the device */
    if ((*matstruct)->alpha) { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta)  { err = cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Mat_SeqAIJCUSPARSETriFactors_Destroy - releases all four triangular-factor
   structures (lower/upper, plain and transpose), the permutation index
   vectors, the work vector, and the cusparse handle.

   Fix over the original: the PetscErrorCode returned by the four
   Mat_SeqAIJCUSPARSETriFactorStruct_Destroy() calls was silently ignored, and
   the handle test used an assignment inside the if-condition. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
