/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for the MatCUSPARSEStorageFormat enum; the trailing entries are
   the enum type name and option prefix required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: factorization entry points provided by this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: GPU solve / multiply kernels and options handling */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destructors for the GPU-side containers; the two
   MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   (different parameter lists) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a (caller-owned) CUDA stream to the cuSPARSE handle stored in A so
   subsequent cuSPARSE calls for this matrix are issued on that stream */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE library handle used by A. Any handle the matrix
   previously held is destroyed before the new one is adopted, and the
   handle is switched to CUSPARSE_POINTER_MODE_DEVICE (scalar arguments
   such as alpha/beta are read from device memory). */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the old handle only when it differs from the incoming one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle stored in A; used when the
   handle is owned elsewhere and must not be destroyed with this matrix */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory registered with PETSc's MatSolverType mechanism: creates the factor
   matrix B (type SEQAIJCUSPARSE) for an LU/ILU/ILUDT or Cholesky/ICC
   factorization of A and installs the symbolic-factorization callbacks; the
   numeric callbacks are installed later by the symbolic routines. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the CPU storage of the factor is allocated later by
     the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* before CUDA 4.2 only the CSR storage format is available */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (a no-op for types that do
     not compose "MatCUSPARSESetFormat_C") */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process runtime options for SEQAIJCUSPARSE matrices:
     -mat_cusparse_mult_storage_format : storage format used for SpMV only
     -mat_cusparse_storage_format      : storage format for SpMV and TriSolve
   Called from the generic MatSetFromOptions() with the PetscOptionItems
   context of the enclosing options block, so the option list opened by the
   caller is extended via PetscOptionsHead()/PetscOptionsTail(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format can be chosen for unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  /* PetscOptionsHead() must be paired with PetscOptionsTail(); the enclosing
     PetscOptionsBegin/End pair lives in the generic caller.  (The previous
     code opened a second options block with PetscObjectOptionsBegin() and
     closed it with PetscOptionsEnd(), leaving the Head unbalanced.) */
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU(k): delegate to the CPU SeqAIJ implementation, then redirect
   the numeric phase to the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* the CPU symbolic phase built the factor structure; the numeric phase
     (which also copies the factors to the GPU) is redirected here */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: delegate to the CPU SeqAIJ implementation and hook the
   CUSPARSE numeric factorization */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC(k): delegate to the CPU SeqAIJ implementation and hook the
   CUSPARSE numeric Cholesky */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation and hook the
   CUSPARSE numeric Cholesky */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build the GPU copy of the unit-diagonal lower triangular factor L from the
   CPU ILU/LU factor stored in A (pinned host staging buffers, then thrust
   device arrays) and run the cuSPARSE triangular-solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when there is no GPU copy or it is stale w.r.t. the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU)
{
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal;
         every later row copies its strictly-lower entries and appends a 1.0
         diagonal term */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: lower triangular with unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* assign() copies the pinned host staging arrays to the device vectors */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the GPU copy of the upper triangular factor U (non-unit diagonal,
   stored as 1/diag times the row scaling used below) from the CPU ILU/LU
   factor of A and run the cuSPARSE triangular-solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; rows are walked backwards because
         the SeqAIJ factor stores U rows via adiag[] in reverse order */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; the CPU factor stores the
           reciprocal of the diagonal at v[nz] */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* assign() copies the pinned host staging arrays to the device vectors */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy both ILU/LU triangular factors of A to the GPU, allocate the solve
   work vector, and cache the row/column permutations (only when they are
   not the identity) for use by the permuted MatSolve variants */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;
A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* store the row permutation on the GPU only when it is non-trivial */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    /* store the column permutation on the GPU only when it is non-trivial */
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build the GPU triangular factors for ICC/Cholesky from the CPU factor: U is
   stored explicitly, and the "lower" factor reuses U's sparsity with scaled
   values, to be applied via a transposed solve */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast to both Mat_SeqAIJ (above) and Mat_SeqSBAIJ
     here; presumably the ICC factor layout matches the sbaij i/j/a fields --
     confirm against the Cholesky factor storage used by MatICCFactor */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp,
nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; each row stores the (inverted)
         diagonal first, followed by the scaled off-diagonal entries */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate for U and additionally scale by 1/diag for the L values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" factor keeps the
         upper-triangular storage (FILL_MODE_UPPER) because it is applied with
         CUSPARSE_OPERATION_TRANSPOSE below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as U; only the values differ */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Run the ICC analysis: build the GPU factors, allocate the work vector, and
   cache the (symmetric) permutation when it is not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* total nnz of L and U together: off-diagonal entries counted twice plus
     the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr =
ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* ICC uses the same permutation for rows and columns */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU: run the CPU factorization, select the natural-ordering or
   permuted GPU solve variants, then copy the factors to the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: run the CPU factorization, select the appropriate GPU
   solve variants, then copy the factors to the GPU */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build cuSPARSE analysis data for transposed triangular solves by explicitly
   forming the transpose (CSC) of each factor with csr2csc and running the
   solve analysis on the result */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     is flipped because the transpose of a lower factor is upper (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
725 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 726 diagType = cusparseGetMatDiagType(loTriFactor->descr); 727 728 /* Create the matrix description */ 729 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 730 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 731 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 732 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 733 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 734 735 /* Create the solve analysis information */ 736 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 737 738 /* set the operation */ 739 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 740 741 /* allocate GPU space for the CSC of the lower triangular factor*/ 742 loTriFactorT->csrMat = new CsrMatrix; 743 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 744 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 745 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 746 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 747 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 748 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 749 750 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 751 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 752 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 753 loTriFactor->csrMat->values->data().get(), 754 loTriFactor->csrMat->row_offsets->data().get(), 755 loTriFactor->csrMat->column_indices->data().get(), 756 loTriFactorT->csrMat->values->data().get(), 757 loTriFactorT->csrMat->column_indices->data().get(), 758 loTriFactorT->csrMat->row_offsets->data().get(), 759 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 760 761 /* perform the solve analysis on the transposed matrix */ 762 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 763 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 764 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 765 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 766 loTriFactorT->solveInfo);CHKERRCUDA(stat); 767 768 /* assign the pointer. Is this really necessary? */ 769 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 770 771 /*********************************************/ 772 /* Now the Transpose of the Upper Tri Factor */ 773 /*********************************************/ 774 775 /* allocate space for the transpose of the upper triangular factor */ 776 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 777 778 /* set the matrix descriptors of the upper triangular factor */ 779 matrixType = cusparseGetMatType(upTriFactor->descr); 780 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 781 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
782 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 783 diagType = cusparseGetMatDiagType(upTriFactor->descr); 784 785 /* Create the matrix description */ 786 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 787 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 788 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 789 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 790 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 791 792 /* Create the solve analysis information */ 793 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 794 795 /* set the operation */ 796 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 797 798 /* allocate GPU space for the CSC of the upper triangular factor*/ 799 upTriFactorT->csrMat = new CsrMatrix; 800 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 801 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 802 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 803 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 804 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 805 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 806 807 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 808 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 809 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 810 upTriFactor->csrMat->values->data().get(), 811 upTriFactor->csrMat->row_offsets->data().get(), 812 upTriFactor->csrMat->column_indices->data().get(), 813 upTriFactorT->csrMat->values->data().get(), 814 upTriFactorT->csrMat->column_indices->data().get(), 815 upTriFactorT->csrMat->row_offsets->data().get(), 816 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 817 818 /* perform the solve analysis on the transposed matrix */ 819 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 820 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 821 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 822 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 823 upTriFactorT->solveInfo);CHKERRCUDA(stat); 824 825 /* assign the pointer. Is this really necessary? 
*/ 826 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 827 PetscFunctionReturn(0); 828 } 829 830 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 831 { 832 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 833 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 834 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 835 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 836 cusparseStatus_t stat; 837 cusparseIndexBase_t indexBase; 838 cudaError_t err; 839 840 PetscFunctionBegin; 841 842 /* allocate space for the triangular factor information */ 843 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 844 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 845 indexBase = cusparseGetMatIndexBase(matstruct->descr); 846 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 847 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 848 849 /* set alpha and beta */ 850 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 851 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 852 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 853 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 854 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 855 856 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 857 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 858 CsrMatrix *matrixT= new CsrMatrix; 859 matrixT->num_rows = A->cmap->n; 860 matrixT->num_cols = A->rmap->n; 861 matrixT->num_entries = a->nz; 862 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 863 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 864 matrixT->values = new 
THRUSTARRAY(a->nz); 865 866 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 867 indexBase = cusparseGetMatIndexBase(matstruct->descr); 868 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 869 matrix->num_cols, matrix->num_entries, 870 matrix->values->data().get(), 871 matrix->row_offsets->data().get(), 872 matrix->column_indices->data().get(), 873 matrixT->values->data().get(), 874 matrixT->column_indices->data().get(), 875 matrixT->row_offsets->data().get(), 876 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 877 878 /* assign the pointer */ 879 matstructT->mat = matrixT; 880 881 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 882 #if CUDA_VERSION>=5000 883 /* First convert HYB to CSR */ 884 CsrMatrix *temp= new CsrMatrix; 885 temp->num_rows = A->rmap->n; 886 temp->num_cols = A->cmap->n; 887 temp->num_entries = a->nz; 888 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 889 temp->column_indices = new THRUSTINTARRAY32(a->nz); 890 temp->values = new THRUSTARRAY(a->nz); 891 892 893 stat = cusparse_hyb2csr(cusparsestruct->handle, 894 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 895 temp->values->data().get(), 896 temp->row_offsets->data().get(), 897 temp->column_indices->data().get());CHKERRCUDA(stat); 898 899 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 900 CsrMatrix *tempT= new CsrMatrix; 901 tempT->num_rows = A->rmap->n; 902 tempT->num_cols = A->cmap->n; 903 tempT->num_entries = a->nz; 904 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 905 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 906 tempT->values = new THRUSTARRAY(a->nz); 907 908 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 909 temp->num_cols, temp->num_entries, 910 temp->values->data().get(), 911 temp->row_offsets->data().get(), 912 temp->column_indices->data().get(), 913 tempT->values->data().get(), 914 tempT->column_indices->data().get(), 915 tempT->row_offsets->data().get(), 916 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 917 918 /* Last, convert CSC to HYB */ 919 cusparseHybMat_t hybMat; 920 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 921 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 922 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 923 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 924 matstructT->descr, tempT->values->data().get(), 925 tempT->row_offsets->data().get(), 926 tempT->column_indices->data().get(), 927 hybMat, 0, partition);CHKERRCUDA(stat); 928 929 /* assign the pointer */ 930 matstructT->mat = hybMat; 931 932 /* delete temporaries */ 933 if (tempT) { 934 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 935 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 936 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 937 delete (CsrMatrix*) tempT; 938 } 939 if (temp) { 940 if (temp->values) delete (THRUSTARRAY*) temp->values; 941 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 942 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 943 delete (CsrMatrix*) temp; 944 } 945 #else 946 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in 
MatMultTranspose) require CUDA 5.0 or later."); 947 #endif 948 } 949 /* assign the compressed row indices */ 950 matstructT->cprowIndices = new THRUSTINTARRAY; 951 matstructT->cprowIndices->resize(A->cmap->n); 952 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 953 954 /* assign the pointer */ 955 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 956 PetscFunctionReturn(0); 957 } 958 959 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 960 { 961 PetscInt n = xx->map->n; 962 const PetscScalar *barray; 963 PetscScalar *xarray; 964 thrust::device_ptr<const PetscScalar> bGPU; 965 thrust::device_ptr<PetscScalar> xGPU; 966 cusparseStatus_t stat; 967 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 968 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 969 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 970 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 971 PetscErrorCode ierr; 972 973 PetscFunctionBegin; 974 /* Analyze the matrix and create the transpose ... 
on the fly */ 975 if (!loTriFactorT && !upTriFactorT) { 976 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 977 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 978 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 979 } 980 981 /* Get the GPU pointers */ 982 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 983 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 984 xGPU = thrust::device_pointer_cast(xarray); 985 bGPU = thrust::device_pointer_cast(barray); 986 987 /* First, reorder with the row permutation */ 988 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 989 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 990 xGPU); 991 992 /* First, solve U */ 993 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 994 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 995 upTriFactorT->csrMat->values->data().get(), 996 upTriFactorT->csrMat->row_offsets->data().get(), 997 upTriFactorT->csrMat->column_indices->data().get(), 998 upTriFactorT->solveInfo, 999 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1000 1001 /* Then, solve L */ 1002 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1003 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1004 loTriFactorT->csrMat->values->data().get(), 1005 loTriFactorT->csrMat->row_offsets->data().get(), 1006 loTriFactorT->csrMat->column_indices->data().get(), 1007 loTriFactorT->solveInfo, 1008 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1009 1010 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1011 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1012 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1013 tempGPU->begin()); 1014 1015 /* Copy the temporary to the full solution. */ 1016 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1017 1018 /* restore */ 1019 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1020 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1021 ierr = WaitForGPU();CHKERRCUDA(ierr); 1022 1023 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1024 PetscFunctionReturn(0); 1025 } 1026 1027 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1028 { 1029 const PetscScalar *barray; 1030 PetscScalar *xarray; 1031 cusparseStatus_t stat; 1032 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1033 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1034 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1035 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1036 PetscErrorCode ierr; 1037 1038 PetscFunctionBegin; 1039 /* Analyze the matrix and create the transpose ... 
on the fly */ 1040 if (!loTriFactorT && !upTriFactorT) { 1041 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1042 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1043 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1044 } 1045 1046 /* Get the GPU pointers */ 1047 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1048 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1049 1050 /* First, solve U */ 1051 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1052 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1053 upTriFactorT->csrMat->values->data().get(), 1054 upTriFactorT->csrMat->row_offsets->data().get(), 1055 upTriFactorT->csrMat->column_indices->data().get(), 1056 upTriFactorT->solveInfo, 1057 barray, tempGPU->data().get());CHKERRCUDA(stat); 1058 1059 /* Then, solve L */ 1060 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1061 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1062 loTriFactorT->csrMat->values->data().get(), 1063 loTriFactorT->csrMat->row_offsets->data().get(), 1064 loTriFactorT->csrMat->column_indices->data().get(), 1065 loTriFactorT->solveInfo, 1066 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1067 1068 /* restore */ 1069 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1070 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1071 ierr = WaitForGPU();CHKERRCUDA(ierr); 1072 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1073 PetscFunctionReturn(0); 1074 } 1075 1076 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1077 { 1078 const PetscScalar *barray; 1079 PetscScalar *xarray; 1080 thrust::device_ptr<const PetscScalar> bGPU; 1081 thrust::device_ptr<PetscScalar> xGPU; 1082 cusparseStatus_t stat; 1083 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1084 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1085 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1086 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1087 PetscErrorCode ierr; 1088 1089 PetscFunctionBegin; 1090 1091 /* Get the GPU pointers */ 1092 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1093 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1094 xGPU = thrust::device_pointer_cast(xarray); 1095 bGPU = thrust::device_pointer_cast(barray); 1096 1097 /* First, reorder with the row permutation */ 1098 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1099 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1100 xGPU); 1101 1102 /* Next, solve L */ 1103 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1104 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1105 loTriFactor->csrMat->values->data().get(), 1106 loTriFactor->csrMat->row_offsets->data().get(), 1107 loTriFactor->csrMat->column_indices->data().get(), 1108 loTriFactor->solveInfo, 1109 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1110 1111 /* Then, solve U */ 1112 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1113 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1114 upTriFactor->csrMat->values->data().get(), 1115 upTriFactor->csrMat->row_offsets->data().get(), 1116 upTriFactor->csrMat->column_indices->data().get(), 1117 upTriFactor->solveInfo, 1118 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1119 1120 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1121 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1122 thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()), 1123 tempGPU->begin()); 1124 1125 /* Copy the temporary to the full solution. */ 1126 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1127 1128 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1129 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1130 ierr = WaitForGPU();CHKERRCUDA(ierr); 1131 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1132 PetscFunctionReturn(0); 1133 } 1134 1135 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1136 { 1137 const PetscScalar *barray; 1138 PetscScalar *xarray; 1139 cusparseStatus_t stat; 1140 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1141 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1142 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1143 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1144 PetscErrorCode ierr; 1145 1146 PetscFunctionBegin; 1147 /* Get the GPU pointers */ 1148 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1149 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1150 1151 /* First, solve L */ 1152 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1153 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1154 loTriFactor->csrMat->values->data().get(), 1155 loTriFactor->csrMat->row_offsets->data().get(), 1156 loTriFactor->csrMat->column_indices->data().get(), 1157 loTriFactor->solveInfo, 1158 barray, tempGPU->data().get());CHKERRCUDA(stat); 1159 1160 /* Next, solve U */ 1161 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1162 upTriFactor->csrMat->num_rows, 
&ALPHA, upTriFactor->descr, 1163 upTriFactor->csrMat->values->data().get(), 1164 upTriFactor->csrMat->row_offsets->data().get(), 1165 upTriFactor->csrMat->column_indices->data().get(), 1166 upTriFactor->solveInfo, 1167 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1168 1169 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1170 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1171 ierr = WaitForGPU();CHKERRCUDA(ierr); 1172 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1177 { 1178 1179 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1180 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1181 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1182 PetscInt m = A->rmap->n,*ii,*ridx; 1183 PetscErrorCode ierr; 1184 cusparseStatus_t stat; 1185 cudaError_t err; 1186 1187 PetscFunctionBegin; 1188 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 1189 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1190 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1191 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1192 /* copy values only */ 1193 matrix->values->assign(a->a, a->a+a->nz); 1194 } else { 1195 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1196 try { 1197 cusparsestruct->nonzerorow=0; 1198 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1199 1200 if (a->compressedrow.use) { 1201 m = a->compressedrow.nrows; 1202 ii = a->compressedrow.i; 1203 ridx = a->compressedrow.rindex; 1204 } else { 1205 /* Forcing compressed row on the GPU */ 1206 int k=0; 1207 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1208 ierr = PetscMalloc1(cusparsestruct->nonzerorow, 
&ridx);CHKERRQ(ierr); 1209 ii[0]=0; 1210 for (int j = 0; j<m; j++) { 1211 if ((a->i[j+1]-a->i[j])>0) { 1212 ii[k] = a->i[j]; 1213 ridx[k]= j; 1214 k++; 1215 } 1216 } 1217 ii[cusparsestruct->nonzerorow] = a->nz; 1218 m = cusparsestruct->nonzerorow; 1219 } 1220 1221 /* allocate space for the triangular factor information */ 1222 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1223 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1224 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1225 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1226 1227 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 1228 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1229 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err); 1230 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1231 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1232 1233 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1234 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1235 /* set the matrix */ 1236 CsrMatrix *matrix= new CsrMatrix; 1237 matrix->num_rows = m; 1238 matrix->num_cols = A->cmap->n; 1239 matrix->num_entries = a->nz; 1240 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1241 matrix->row_offsets->assign(ii, ii + m+1); 1242 1243 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1244 matrix->column_indices->assign(a->j, a->j+a->nz); 1245 1246 matrix->values = new THRUSTARRAY(a->nz); 1247 matrix->values->assign(a->a, a->a+a->nz); 1248 1249 /* assign the pointer */ 1250 matstruct->mat = matrix; 1251 1252 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1253 #if CUDA_VERSION>=4020 1254 CsrMatrix *matrix= new 
CsrMatrix; 1255 matrix->num_rows = m; 1256 matrix->num_cols = A->cmap->n; 1257 matrix->num_entries = a->nz; 1258 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1259 matrix->row_offsets->assign(ii, ii + m+1); 1260 1261 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1262 matrix->column_indices->assign(a->j, a->j+a->nz); 1263 1264 matrix->values = new THRUSTARRAY(a->nz); 1265 matrix->values->assign(a->a, a->a+a->nz); 1266 1267 cusparseHybMat_t hybMat; 1268 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1269 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1270 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1271 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1272 matstruct->descr, matrix->values->data().get(), 1273 matrix->row_offsets->data().get(), 1274 matrix->column_indices->data().get(), 1275 hybMat, 0, partition);CHKERRCUDA(stat); 1276 /* assign the pointer */ 1277 matstruct->mat = hybMat; 1278 1279 if (matrix) { 1280 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1281 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1282 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1283 delete (CsrMatrix*)matrix; 1284 } 1285 #endif 1286 } 1287 1288 /* assign the compressed row indices */ 1289 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1290 matstruct->cprowIndices->assign(ridx,ridx+m); 1291 1292 /* assign the pointer */ 1293 cusparsestruct->mat = matstruct; 1294 1295 if (!a->compressedrow.use) { 1296 ierr = PetscFree(ii);CHKERRQ(ierr); 1297 ierr = PetscFree(ridx);CHKERRQ(ierr); 1298 } 1299 cusparsestruct->workVector = new THRUSTARRAY(m); 1300 } catch(char *ex) { 1301 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1302 } 1303 cusparsestruct->nonzerostate = A->nonzerostate; 1304 } 1305 ierr = WaitForGPU();CHKERRCUDA(ierr); 1306 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 1307 ierr = 
PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1308 } 1309 PetscFunctionReturn(0); 1310 } 1311 1312 struct VecCUDAPlusEquals 1313 { 1314 template <typename Tuple> 1315 __host__ __device__ 1316 void operator()(Tuple t) 1317 { 1318 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1319 } 1320 }; 1321 1322 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1323 { 1324 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1325 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1326 Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */ 1327 const PetscScalar *xarray; 1328 PetscScalar *yarray; 1329 PetscErrorCode ierr; 1330 cusparseStatus_t stat; 1331 1332 PetscFunctionBegin; 1333 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1334 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1335 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1336 ierr = VecSet(yy,0);CHKERRQ(ierr); 1337 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1338 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1339 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1340 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1341 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1342 mat->num_rows, mat->num_cols, mat->num_entries, 1343 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1344 mat->column_indices->data().get(), xarray, matstruct->beta, 1345 yarray);CHKERRCUDA(stat); 1346 } else { 1347 #if CUDA_VERSION>=4020 1348 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1349 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1350 matstruct->alpha, matstruct->descr, hybMat, 1351 xarray, matstruct->beta, 1352 yarray);CHKERRCUDA(stat); 1353 #endif 1354 } 1355 ierr = 
VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1356 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1357 if (!cusparsestruct->stream) { 1358 ierr = WaitForGPU();CHKERRCUDA(ierr); 1359 } 1360 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1361 PetscFunctionReturn(0); 1362 } 1363 1364 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1365 { 1366 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1367 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1368 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1369 const PetscScalar *xarray; 1370 PetscScalar *yarray; 1371 PetscErrorCode ierr; 1372 cusparseStatus_t stat; 1373 1374 PetscFunctionBegin; 1375 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1376 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1377 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1378 if (!matstructT) { 1379 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1380 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1381 } 1382 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1383 ierr = VecSet(yy,0);CHKERRQ(ierr); 1384 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1385 1386 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1387 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1388 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1389 mat->num_rows, mat->num_cols, 1390 mat->num_entries, matstructT->alpha, matstructT->descr, 1391 mat->values->data().get(), mat->row_offsets->data().get(), 1392 mat->column_indices->data().get(), xarray, matstructT->beta, 1393 yarray);CHKERRCUDA(stat); 1394 } else { 1395 #if CUDA_VERSION>=4020 1396 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1397 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1398 matstructT->alpha, matstructT->descr, 
hybMat, xarray, matstructT->beta,
                           yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatMultAdd_SeqAIJCUSPARSE - computes zz = A*xx + yy on the GPU.

   zz is seeded with a copy of yy; the product A*xx is then formed into the
   compressed work vector (one entry per nonzero row) and scattered into zz
   with a += operation through cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* operations that modify the matrix on the CPU (axpy, scale, etc) make the device copy stale */
  ierr      = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* seed the result with yy, then accumulate A*xx on top of it */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* sparse product into the compressed work vector */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* the row count handed to the SpMV is the number of COMPRESSED rows,
         which equals the size of workVector, not the full row dimension */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the compressed result back into the full vector: zz[cprowIndices[i]] += workVector[i] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - computes zz = A^T*xx + yy on the GPU,
   using an explicitly stored transpose of A that is generated lazily on first use.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* operations that modify the matrix on the CPU (axpy, scale, etc) make the device copy stale */
  ierr       = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose the first time it is needed */
    ierr       = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed the result with yy, then accumulate A^T*xx on top of it */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* non-transposed SpMV on the stored transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* the row count handed to the SpMV is the number of COMPRESSED rows,
         which equals the size of workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the compressed result back into the full vector with a += operation.
       NOTE(review): the iteration length here is A->cmap->n, while the non-transposed
       MultAdd above uses workVector->size(); these agree only when every column of A
       carries a nonzero -- confirm against MatSeqAIJCUSPARSEGenerateTransposeForMult */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                     VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* After host-side assembly, refresh the device copy and (on final assembly) install the GPU mat-vec kernels */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = 
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Release the GPU-side structures (plain matrix or triangular factors) then fall through to the host AIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicate via the host AIJ duplicate, then attach fresh (empty) GPU data structures and the GPU ops table */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
    C->spptr            = spptr;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;

    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
    C->spptr                       = spptr;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor for MATSEQAIJCUSPARSE: builds on MatCreate_SeqAIJ and attaches the GPU structures and ops */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
1706 now build a GPU matrix data structure */ 1707 B->spptr = new Mat_SeqAIJCUSPARSE; 1708 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1709 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1710 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1711 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1712 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1713 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1714 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1715 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1716 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1717 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1718 } else { 1719 /* NEXT, set the pointers to the triangular factors */ 1720 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1721 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1723 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1724 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1725 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1726 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1727 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1728 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1729 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1730 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1731 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1732 } 1733 1734 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1735 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1736 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1737 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1738 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1739 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1740 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1741 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1742 1743 ierr = 
PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1744 1745 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1746 1747 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1748 PetscFunctionReturn(0); 1749 } 1750 1751 /*MC 1752 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1753 1754 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1755 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1756 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1757 1758 Options Database Keys: 1759 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1760 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1761 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1762 1763 Level: beginner 1764 1765 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1766 M*/ 1767 1768 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1769 1770 1771 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1772 { 1773 PetscErrorCode ierr; 1774 1775 PetscFunctionBegin; 1776 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1777 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1778 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1779 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1780 PetscFunctionReturn(0); 1781 } 1782 1783 1784 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1785 { 1786 cusparseStatus_t stat; 1787 cusparseHandle_t handle; 1788 1789 PetscFunctionBegin; 1790 if (*cusparsestruct) { 1791 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1792 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1793 delete (*cusparsestruct)->workVector; 1794 if (handle = (*cusparsestruct)->handle) { 1795 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1796 } 1797 delete *cusparsestruct; 1798 *cusparsestruct = 0; 1799 } 1800 PetscFunctionReturn(0); 1801 } 1802 1803 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1804 { 1805 PetscFunctionBegin; 1806 if (*mat) { 1807 delete (*mat)->values; 1808 delete (*mat)->column_indices; 1809 delete (*mat)->row_offsets; 1810 delete *mat; 1811 *mat = 0; 1812 } 1813 PetscFunctionReturn(0); 1814 } 1815 1816 
/* Destroy a triangular-factor structure: matrix descriptor, solve-analysis info, and the CSR data; safe on NULL */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Destroy a mat-vec structure; the storage format determines whether ->mat holds a HYB matrix or a CsrMatrix */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        CsrMatrix *mat = (CsrMatrix*)(*matstruct)->mat;
        /* previously the error code was silently dropped; check it like the trifactor overload does */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta are device-side scalars allocated with cudaMalloc */
    if ((*matstruct)->alpha) { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta)  { err = cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Destroy all four triangular factors, the row/column permutations, the work vector, and the cusparse handle; safe on NULL */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* previously these four error codes were silently dropped; check them like the rest of the file does */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}