1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode 
Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a caller-owned CUDA stream to the cuSPARSE handle held in A->spptr;
   subsequent cuSPARSE calls through that handle execute on this stream.
   NOTE(review): assumes A is of a CUSPARSE type so that A->spptr really is a
   Mat_SeqAIJCUSPARSE — no type check is performed here. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle held in A->spptr with a caller-supplied one.
   Any previously held handle is destroyed first; the new handle is switched
   to device pointer mode so scalar arguments are read from GPU memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop (but do NOT destroy) the cuSPARSE handle stored in A->spptr; contrast
   with MatCUSPARSESetHandle(), which destroys the old handle.  The caller
   that installed the handle keeps ownership of it. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates an empty n x n
   SEQAIJCUSPARSE factor matrix B and installs the symbolic-factorization
   entry points for the requested factor type.  Only LU/ILU/ILUDT and
   Cholesky/ICC are supported; anything else raises PETSC_ERR_SUP.
   NOTE(review): block sizes are propagated from A only on the LU branch. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor matrix gets its storage from the factorization, not from preallocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for SpMV (MAT_CUSPARSE_MULT) or for all
   operations (MAT_CUSPARSE_ALL).  ELL and HYB need CUDA 4.2+; with older
   toolkits only CSR is accepted. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the implementation composed on A (no-op for other types) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* options for a SEQAIJCUSPARSE matrix.
   -mat_cusparse_mult_storage_format applies only to unfactored matrices
   (SpMV); -mat_cusparse_storage_format applies to SpMV and TriSolve. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: defer to the CPU SeqAIJ implementation, then redirect the
   numeric phase to the CUSPARSE version so the factors land on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: defer to the CPU SeqAIJ implementation, then redirect the
   numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: defer to the CPU SeqAIJ implementation, then redirect the
   numeric phase to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: defer to the CPU SeqAIJ implementation, then redirect
   the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build the GPU copy of the unit lower triangular factor L from the
   CPU-resident factored AIJ data and run the cuSPARSE solve analysis on it.
   The CSR arrays are staged in pinned host memory, copied into thrust device
   vectors, and the staging buffers are freed before returning.  Does nothing
   if the GPU copy is already up to date.
   NOTE(review): assumes PETSc's factored-SeqAIJ layout where a->i/a->j hold
   the strictly-lower part row by row (hence nzLower = n + ai[n] - ai[1],
   adding one unit-diagonal entry per row) -- confirm against
   MatLUFactorNumeric_SeqAIJ if that layout ever changes. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix: pinned host memory
         is used so the later device copies are fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 consists only of the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis (required once before csrsv_solve calls) */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the GPU copy of the upper triangular factor U (non-unit diagonal,
   stored as 1/diag so the solve can multiply) from the CPU-resident factored
   AIJ data, and run the cuSPARSE solve analysis on it.  Rows are walked
   backwards using a->diag, which in the factored layout delimits the U rows.
   Does nothing if the GPU copy is already up to date. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first
         since adiag[] decreases with the row index in the factored layout */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored inverted in the factor) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Push both ILU triangular factors to the GPU, size the solve work vector,
   and cache the row/column permutations (only when they are not the
   identity) as device index arrays for use by MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* intermediate vector shared by the lower and upper triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only cache the permutation when it actually permutes something */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC (Cholesky) triangular factors on the GPU from the
   CPU-resident factor.  Only the upper factor U (with inverted diagonal) is
   formed explicitly; the "lower" factor reuses the same upper-triangular
   pattern with rescaled values and is solved with CUSPARSE_OPERATION_TRANSPOSE,
   so its fill mode is deliberately set to CUSPARSE_FILL_MODE_UPPER.
   NOTE(review): the data is read through a Mat_SeqSBAIJ cast of A->data
   (b->i/b->j/b->a) while nzUpper comes from the Mat_SeqAIJ cast -- this
   relies on the two layouts agreeing for the factored matrix; confirm
   against the SBAIJ factor storage before touching this. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned staging buffers;
         AALo holds the rescaled values for the transposed "lower" solve) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals and scale the lower-factor copy by 1/diag */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description.  Fill mode is UPPER on purpose: the
         lower factor is the transpose of this upper-pattern matrix and is
         solved with CUSPARSE_OPERATION_TRANSPOSE below. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity as the upper factor, rescaled values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Push the ICC triangular factors to the GPU, size the solve work vector,
   and cache the (single, symmetric) permutation as both row and column
   device index arrays when it is not the identity.  nnz counts the full
   symmetric matrix: both triangles minus the doubly-counted diagonal. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: the same permutation serves as both row and
       column permutation on the device */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU/ILU factorization: run the CPU SeqAIJ numeric phase, select the
   natural-ordering fast path for the GPU solves when both permutations are
   the identity, then copy the triangular factors to the GPU and run the
   cuSPARSE solve analysis. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC factorization: run the CPU SeqAIJ numeric phase,
   select the natural-ordering fast path when the permutation is the
   identity, then copy the factors to the GPU and run the solve analysis. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build transposed (CSC) copies of both triangular factors and run the
   cuSPARSE solve analysis on them, for use by MatSolveTranspose.  Each
   transpose keeps the original descriptor settings except the fill mode,
   which is flipped (lower <-> upper). */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor,
     flipping the fill mode for the transpose */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
725 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 726 diagType = cusparseGetMatDiagType(loTriFactor->descr); 727 728 /* Create the matrix description */ 729 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 730 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 731 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 732 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 733 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 734 735 /* Create the solve analysis information */ 736 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 737 738 /* set the operation */ 739 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 740 741 /* allocate GPU space for the CSC of the lower triangular factor*/ 742 loTriFactorT->csrMat = new CsrMatrix; 743 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 744 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 745 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 746 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 747 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 748 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 749 750 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 751 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 752 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 753 loTriFactor->csrMat->values->data().get(), 754 loTriFactor->csrMat->row_offsets->data().get(), 755 loTriFactor->csrMat->column_indices->data().get(), 756 loTriFactorT->csrMat->values->data().get(), 757 loTriFactorT->csrMat->column_indices->data().get(), 758 loTriFactorT->csrMat->row_offsets->data().get(), 759 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 760 761 /* perform the solve analysis on the transposed matrix */ 762 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 763 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 764 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 765 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 766 loTriFactorT->solveInfo);CHKERRCUDA(stat); 767 768 /* assign the pointer. Is this really necessary? */ 769 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 770 771 /*********************************************/ 772 /* Now the Transpose of the Upper Tri Factor */ 773 /*********************************************/ 774 775 /* allocate space for the transpose of the upper triangular factor */ 776 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 777 778 /* set the matrix descriptors of the upper triangular factor */ 779 matrixType = cusparseGetMatType(upTriFactor->descr); 780 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 781 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
782 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 783 diagType = cusparseGetMatDiagType(upTriFactor->descr); 784 785 /* Create the matrix description */ 786 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 787 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 788 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 789 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 790 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 791 792 /* Create the solve analysis information */ 793 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 794 795 /* set the operation */ 796 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 797 798 /* allocate GPU space for the CSC of the upper triangular factor*/ 799 upTriFactorT->csrMat = new CsrMatrix; 800 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 801 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 802 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 803 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 804 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 805 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 806 807 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 808 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 809 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 810 upTriFactor->csrMat->values->data().get(), 811 upTriFactor->csrMat->row_offsets->data().get(), 812 upTriFactor->csrMat->column_indices->data().get(), 813 upTriFactorT->csrMat->values->data().get(), 814 upTriFactorT->csrMat->column_indices->data().get(), 815 upTriFactorT->csrMat->row_offsets->data().get(), 816 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 817 818 /* perform the solve analysis on the transposed matrix */ 819 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 820 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 821 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 822 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 823 upTriFactorT->solveInfo);CHKERRCUDA(stat); 824 825 /* assign the pointer. Is this really necessary? 
*/ 826 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 827 PetscFunctionReturn(0); 828 } 829 830 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 831 { 832 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 833 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 834 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 835 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 836 cusparseStatus_t stat; 837 cusparseIndexBase_t indexBase; 838 cudaError_t err; 839 840 PetscFunctionBegin; 841 842 /* allocate space for the triangular factor information */ 843 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 844 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 845 indexBase = cusparseGetMatIndexBase(matstruct->descr); 846 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 847 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 848 849 /* set alpha and beta */ 850 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 851 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 852 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 853 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 854 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 855 856 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 857 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 858 CsrMatrix *matrixT= new CsrMatrix; 859 matrixT->num_rows = A->rmap->n; 860 matrixT->num_cols = A->cmap->n; 861 matrixT->num_entries = a->nz; 862 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 863 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 864 matrixT->values = new 
THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of A's CSR */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (left empty: the transpose is not stored in
     compressed-row form) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b with the factored matrix, applying the row/column permutations.
   The transposed factors are built lazily on first use; since they are stored
   explicitly, the triangular solves run NON_TRANSPOSE, in the order U^T then L^T
   (the reverse of the L-then-U order used for A x = b).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b with the factored matrix when the ordering is the identity:
   no permutation copies are needed, so b feeds the U^T solve directly and the
   L^T solve writes straight into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b with the factored matrix under a general ordering: permute b by the
   row permutation, solve L then U, then apply the column permutation to the result.
   Both bb and xx must be VECSEQCUDA vectors (checked explicitly).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;
  VecType t;
  PetscBool flg;

  PetscFunctionBegin;
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation (the copy length is governed by the
     rpermIndices index range) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b with the factored matrix when the ordering is the identity:
   b feeds the L solve directly and the U solve writes straight into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L */
  stat =
cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Copies the host Mat_SeqAIJ data to the GPU when the GPU copy is missing or stale.
   If only the numerical values changed (same nonzero state, CSR format), just the
   values array is re-uploaded; otherwise the cusparse structures are rebuilt from
   scratch. Rows are always stored compressed on the GPU: if the host matrix is not
   in compressed-row form, a compressed row_offsets/rindex pair is constructed here.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
    } else {
      /* nonzero pattern (or format) changed: tear down and rebuild the GPU structures */
      Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets/indices that skip
             empty rows; after this m is the number of nonempty rows */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident alpha/beta scalars, matching DEVICE pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* stage through a temporary CSR matrix, convert to HYB, then free the CSR */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
          /* NOTE(review): with CUDA_VERSION<4020 this branch leaves matstruct->mat
             unset -- confirm ELL/HYB is rejected elsewhere for such builds. */
        }

        /* assign the compressed row indices: cprowIndices[k] is the original row of
           compressed row k */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated here only in the non-compressed case */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* work vector sized to the number of compressed rows (used by MatMultAdd) */
        cusparsestruct->workVector = new THRUSTARRAY;
        cusparsestruct->workVector->resize(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_CUDA_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Creates compatible CUDA (VECSEQCUDA) left/right work vectors for the matrix,
   sharing the matrix's row/column layouts.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* thrust functor: second tuple element += first tuple element (used to scatter-add
   compressed SpMV results into a full-length vector) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/*
   Computes yy = A*xx on the GPU (yy zeroed first, so beta is irrelevant to the result).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  const PetscScalar *xarray;
  PetscScalar *yarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* NOTE(review): MatSeqAIJCUSPARSECopyToGPU stores the matrix with compressed rows,
       so mat->num_rows is the number of NONEMPTY rows; this SpMV writes a compacted
       result into the leading entries of yarray without scattering through
       cprowIndices (contrast MatMultAdd below). Results look wrong whenever A has
       empty rows -- confirm against the upstream fix history. */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* skip the sync when the user installed a stream: work is left queued on it */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Computes yy = A^T*xx using the explicitly stored transpose (built lazily on
   first use), so the SpMV itself runs NON_TRANSPOSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  const PetscScalar *xarray;
  PetscScalar *yarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* NOTE(review): the transpose is built once and only if absent; a later values-only
     change to A updates cusparsestruct->mat (via CopyToGPU above) but not
     matTranspose -- confirm the transpose is invalidated elsewhere when values
     change, otherwise this multiplies with stale values. */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
   Computes zz = A*xx + yy: copies yy into zz, runs the (compressed-row) SpMV into
   the work vector, then scatter-adds the work vector into zz through cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar *xarray;
  PetscScalar *zarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation:
       workVector[k] is added to z[cprowIndices[k]] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Computes zz = A^T*xx + yy using the explicitly stored transpose (built lazily
   on first use).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT =
                                                 (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!matstructT) {
    /* the explicit transpose has not been built yet; build it now */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy, then accumulate the product into zz */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finishes assembly on the CPU (via the base SeqAIJ routine), mirrors the
   data to the GPU for non-factored matrices, and on a final assembly
   (re)installs the GPU mat-vec kernels in the ops table. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* a flush assembly is not final; do not switch the ops yet */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.
1616 1617 Specify the preallocated storage with either nz or nnz (not both). 1618 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1619 allocation. For large problems you MUST preallocate memory or you 1620 will get TERRIBLE performance, see the users' manual chapter on matrices. 1621 1622 By default, this format uses inodes (identical nodes) when possible, to 1623 improve numerical efficiency of matrix-vector products and solves. We 1624 search for consecutive rows with the same nonzero structure, thereby 1625 reusing matrix information to achieve increased efficiency. 1626 1627 Level: intermediate 1628 1629 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1630 @*/ 1631 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1632 { 1633 PetscErrorCode ierr; 1634 1635 PetscFunctionBegin; 1636 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1637 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1638 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1639 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1640 PetscFunctionReturn(0); 1641 } 1642 1643 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1644 { 1645 PetscErrorCode ierr; 1646 1647 PetscFunctionBegin; 1648 if (A->factortype==MAT_FACTOR_NONE) { 1649 if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) { 1650 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1651 } 1652 } else { 1653 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1654 } 1655 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1656 PetscFunctionReturn(0); 1657 } 1658 1659 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1660 { 1661 PetscErrorCode ierr; 1662 cusparseStatus_t stat; 1663 cusparseHandle_t handle=0; 1664 1665 PetscFunctionBegin; 
1666 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1667 if (B->factortype==MAT_FACTOR_NONE) { 1668 /* you cannot check the inode.use flag here since the matrix was just created. 1669 now build a GPU matrix data structure */ 1670 B->spptr = new Mat_SeqAIJCUSPARSE; 1671 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1672 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1673 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1674 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1675 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1676 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1677 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1678 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1679 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1680 } else { 1681 /* NEXT, set the pointers to the triangular factors */ 1682 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1683 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1684 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1685 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1686 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1687 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1688 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1689 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1690 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1691 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1692 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1693 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1694 } 1695 1696 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1697 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1698 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1699 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1700 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1701 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1702 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1703 
B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1704 1705 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1706 1707 B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED; 1708 1709 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1710 PetscFunctionReturn(0); 1711 } 1712 1713 /*M 1714 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1715 1716 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1717 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1718 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1719 1720 Options Database Keys: 1721 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1722 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1723 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1724 1725 Level: beginner 1726 1727 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1728 M*/ 1729 1730 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1731 1732 1733 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1734 { 1735 PetscErrorCode ierr; 1736 1737 PetscFunctionBegin; 1738 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1739 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1740 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1741 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1742 PetscFunctionReturn(0); 1743 } 1744 1745 1746 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1747 { 1748 cusparseStatus_t stat; 1749 cusparseHandle_t handle; 1750 1751 PetscFunctionBegin; 1752 if (*cusparsestruct) { 1753 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1754 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1755 delete (*cusparsestruct)->workVector; 1756 if (handle = (*cusparsestruct)->handle) { 1757 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1758 } 1759 delete *cusparsestruct; 1760 *cusparsestruct = 0; 1761 } 1762 PetscFunctionReturn(0); 1763 } 1764 1765 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1766 { 1767 PetscFunctionBegin; 1768 if (*mat) { 1769 delete (*mat)->values; 1770 delete (*mat)->column_indices; 1771 delete (*mat)->row_offsets; 1772 delete *mat; 1773 *mat = 0; 1774 } 1775 
PetscFunctionReturn(0); 1776 } 1777 1778 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1779 { 1780 cusparseStatus_t stat; 1781 PetscErrorCode ierr; 1782 1783 PetscFunctionBegin; 1784 if (*trifactor) { 1785 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1786 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1787 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1788 delete *trifactor; 1789 *trifactor = 0; 1790 } 1791 PetscFunctionReturn(0); 1792 } 1793 1794 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1795 { 1796 CsrMatrix *mat; 1797 cusparseStatus_t stat; 1798 cudaError_t err; 1799 1800 PetscFunctionBegin; 1801 if (*matstruct) { 1802 if ((*matstruct)->mat) { 1803 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1804 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1805 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1806 } else { 1807 mat = (CsrMatrix*)(*matstruct)->mat; 1808 CsrMatrix_Destroy(&mat); 1809 } 1810 } 1811 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1812 delete (*matstruct)->cprowIndices; 1813 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1814 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1815 delete *matstruct; 1816 *matstruct = 0; 1817 } 1818 PetscFunctionReturn(0); 1819 } 1820 1821 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1822 { 1823 cusparseHandle_t handle; 1824 cusparseStatus_t stat; 1825 1826 PetscFunctionBegin; 1827 if (*trifactors) { 1828 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1829 
Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1830 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1831 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1832 delete (*trifactors)->rpermIndices; 1833 delete (*trifactors)->cpermIndices; 1834 delete (*trifactors)->workVector; 1835 if (handle = (*trifactors)->handle) { 1836 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1837 } 1838 delete *trifactors; 1839 *trifactors = 0; 1840 } 1841 PetscFunctionReturn(0); 1842 } 1843 1844