1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 22 23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 26 27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 36 37 static PetscErrorCode 
CsrMatrix_Destroy(CsrMatrix**); 38 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 39 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 40 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 41 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 42 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 43 44 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 45 { 46 cusparseStatus_t stat; 47 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 48 49 PetscFunctionBegin; 50 cusparsestruct->stream = stream; 51 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 52 PetscFunctionReturn(0); 53 } 54 55 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 56 { 57 cusparseStatus_t stat; 58 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 59 60 PetscFunctionBegin; 61 if (cusparsestruct->handle != handle) { 62 if (cusparsestruct->handle) { 63 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 64 } 65 cusparsestruct->handle = handle; 66 } 67 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 68 PetscFunctionReturn(0); 69 } 70 71 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 72 { 73 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 74 75 PetscFunctionBegin; 76 if (cusparsestruct->handle) cusparsestruct->handle = 0; 77 PetscFunctionReturn(0); 78 } 79 80 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 81 { 82 PetscFunctionBegin; 83 *type = MATSOLVERCUSPARSE; 84 PetscFunctionReturn(0); 85 } 86 87 /*MC 88 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 89 on a single GPU of type, seqaijcusparse, 
aijcusparse, or seqaijcusp, aijcusp. Currently supported 90 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 91 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 92 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 93 algorithms are not recommended. This class does NOT support direct solver operations. 94 95 Level: beginner 96 97 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 98 M*/ 99 100 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 101 { 102 PetscErrorCode ierr; 103 PetscInt n = A->rmap->n; 104 105 PetscFunctionBegin; 106 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 107 (*B)->factortype = ftype; 108 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 109 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 110 111 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 112 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 113 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 115 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 116 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 117 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 118 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 119 120 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 121 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 122 PetscFunctionReturn(0); 123 } 124 125 PETSC_INTERN 
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 126 { 127 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 128 129 PetscFunctionBegin; 130 switch (op) { 131 case MAT_CUSPARSE_MULT: 132 cusparsestruct->format = format; 133 break; 134 case MAT_CUSPARSE_ALL: 135 cusparsestruct->format = format; 136 break; 137 default: 138 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 139 } 140 PetscFunctionReturn(0); 141 } 142 143 /*@ 144 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 145 operation. Only the MatMult operation can use different GPU storage formats 146 for MPIAIJCUSPARSE matrices. 147 Not Collective 148 149 Input Parameters: 150 + A - Matrix of type SEQAIJCUSPARSE 151 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 152 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 153 154 Output Parameter: 155 156 Level: intermediate 157 158 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 159 @*/ 160 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 161 { 162 PetscErrorCode ierr; 163 164 PetscFunctionBegin; 165 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 166 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 167 PetscFunctionReturn(0); 168 } 169 170 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 171 { 172 PetscErrorCode ierr; 173 MatCUSPARSEStorageFormat format; 174 PetscBool flg; 175 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 176 177 PetscFunctionBegin; 178 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 179 if (A->factortype==MAT_FACTOR_NONE) { 180 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 181 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 182 if (flg) { 183 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 184 } 185 } 186 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 187 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 188 if (flg) { 189 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 190 } 191 ierr = PetscOptionsTail();CHKERRQ(ierr); 192 PetscFunctionReturn(0); 193 } 194 195 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 196 { 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 ierr = 
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 201 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 202 PetscFunctionReturn(0); 203 } 204 205 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 206 { 207 PetscErrorCode ierr; 208 209 PetscFunctionBegin; 210 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 211 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 212 PetscFunctionReturn(0); 213 } 214 215 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 216 { 217 PetscErrorCode ierr; 218 219 PetscFunctionBegin; 220 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 221 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 222 PetscFunctionReturn(0); 223 } 224 225 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 226 { 227 PetscErrorCode ierr; 228 229 PetscFunctionBegin; 230 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 231 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 232 PetscFunctionReturn(0); 233 } 234 235 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 236 { 237 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 238 PetscInt n = A->rmap->n; 239 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 240 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 241 cusparseStatus_t stat; 242 const PetscInt *ai = a->i,*aj = a->j,*vi; 243 const MatScalar *aa = a->a,*v; 244 PetscInt *AiLo, *AjLo; 245 PetscScalar *AALo; 246 PetscInt i,nz, nzLower, offset, rowOffset; 247 PetscErrorCode ierr; 248 cudaError_t cerr; 249 250 PetscFunctionBegin; 251 if (!n) PetscFunctionReturn(0); 252 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == 
PETSC_OFFLOAD_CPU) { 253 try { 254 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 255 nzLower=n+ai[n]-ai[1]; 256 257 /* Allocate Space for the lower triangular matrix */ 258 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 259 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 260 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 261 262 /* Fill the lower triangular matrix */ 263 AiLo[0] = (PetscInt) 0; 264 AiLo[n] = nzLower; 265 AjLo[0] = (PetscInt) 0; 266 AALo[0] = (MatScalar) 1.0; 267 v = aa; 268 vi = aj; 269 offset = 1; 270 rowOffset= 1; 271 for (i=1; i<n; i++) { 272 nz = ai[i+1] - ai[i]; 273 /* additional 1 for the term on the diagonal */ 274 AiLo[i] = rowOffset; 275 rowOffset += nz+1; 276 277 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 278 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 279 280 offset += nz; 281 AjLo[offset] = (PetscInt) i; 282 AALo[offset] = (MatScalar) 1.0; 283 offset += 1; 284 285 v += nz; 286 vi += nz; 287 } 288 289 /* allocate space for the triangular factor information */ 290 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 291 292 /* Create the matrix description */ 293 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 294 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 295 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 296 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 297 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 298 299 /* Create the solve analysis information */ 300 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 301 302 /* set the operation */ 303 loTriFactor->solveOp = 
CUSPARSE_OPERATION_NON_TRANSPOSE; 304 305 /* set the matrix */ 306 loTriFactor->csrMat = new CsrMatrix; 307 loTriFactor->csrMat->num_rows = n; 308 loTriFactor->csrMat->num_cols = n; 309 loTriFactor->csrMat->num_entries = nzLower; 310 311 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 312 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 313 314 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 315 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 316 317 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 318 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 319 320 /* perform the solve analysis */ 321 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 322 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 323 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 324 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 325 326 /* assign the pointer. Is this really necessary? 
*/ 327 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 328 329 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 330 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 331 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 332 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 333 } catch(char *ex) { 334 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 335 } 336 } 337 PetscFunctionReturn(0); 338 } 339 340 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 341 { 342 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 343 PetscInt n = A->rmap->n; 344 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 345 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 346 cusparseStatus_t stat; 347 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 348 const MatScalar *aa = a->a,*v; 349 PetscInt *AiUp, *AjUp; 350 PetscScalar *AAUp; 351 PetscInt i,nz, nzUpper, offset; 352 PetscErrorCode ierr; 353 cudaError_t cerr; 354 355 PetscFunctionBegin; 356 if (!n) PetscFunctionReturn(0); 357 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 358 try { 359 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 360 nzUpper = adiag[0]-adiag[n]; 361 362 /* Allocate Space for the upper triangular matrix */ 363 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 364 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 365 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 366 367 /* Fill the upper triangular matrix */ 368 AiUp[0]=(PetscInt) 0; 369 AiUp[n]=nzUpper; 370 offset = nzUpper; 371 for (i=n-1; i>=0; i--) { 372 v = aa + adiag[i+1] + 1; 373 vi = aj + adiag[i+1] + 1; 374 375 /* number of elements NOT on the diagonal */ 376 nz = adiag[i] - adiag[i+1]-1; 377 378 /* decrement the offset */ 379 offset -= (nz+1); 380 381 /* first, set the diagonal elements */ 382 AjUp[offset] = (PetscInt) i; 383 AAUp[offset] = (MatScalar)1./v[nz]; 384 AiUp[i] = AiUp[i+1] - (nz+1); 385 386 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 387 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 388 } 389 390 /* allocate space for the triangular factor information */ 391 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 392 393 /* Create the matrix description */ 394 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 395 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 396 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 397 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 398 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 399 400 /* Create the solve analysis information */ 401 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 402 403 /* set the operation */ 404 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 405 406 /* set the matrix */ 407 upTriFactor->csrMat = new CsrMatrix; 408 upTriFactor->csrMat->num_rows 
= n; 409 upTriFactor->csrMat->num_cols = n; 410 upTriFactor->csrMat->num_entries = nzUpper; 411 412 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 413 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 414 415 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 416 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 417 418 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 419 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 420 421 /* perform the solve analysis */ 422 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 423 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 424 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 425 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 426 427 /* assign the pointer. Is this really necessary? */ 428 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 429 430 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 431 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 432 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 433 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 434 } catch(char *ex) { 435 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 436 } 437 } 438 PetscFunctionReturn(0); 439 } 440 441 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 442 { 443 PetscErrorCode ierr; 444 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 445 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 446 IS isrow = a->row,iscol = a->icol; 447 PetscBool row_identity,col_identity; 448 const PetscInt *r,*c; 449 PetscInt n = A->rmap->n; 450 451 PetscFunctionBegin; 452 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 453 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 454 ierr = 
MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 455 456 cusparseTriFactors->workVector = new THRUSTARRAY(n); 457 cusparseTriFactors->nnz=a->nz; 458 459 A->offloadmask = PETSC_OFFLOAD_BOTH; 460 /* lower triangular indices */ 461 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 462 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 463 if (!row_identity) { 464 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 465 cusparseTriFactors->rpermIndices->assign(r, r+n); 466 } 467 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 468 469 /* upper triangular indices */ 470 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 471 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 472 if (!col_identity) { 473 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 474 cusparseTriFactors->cpermIndices->assign(c, c+n); 475 } 476 477 if (!row_identity && !col_identity) { 478 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 479 } else if(!row_identity) { 480 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 481 } else if(!col_identity) { 482 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 483 } 484 485 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 486 PetscFunctionReturn(0); 487 } 488 489 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 490 { 491 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 492 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 493 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 494 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 495 cusparseStatus_t stat; 496 PetscErrorCode ierr; 497 cudaError_t cerr; 498 PetscInt *AiUp, *AjUp; 499 PetscScalar *AAUp; 500 PetscScalar *AALo; 501 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 502 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 503 const PetscInt *ai = b->i,*aj = b->j,*vj; 504 
const MatScalar *aa = b->a,*v; 505 506 PetscFunctionBegin; 507 if (!n) PetscFunctionReturn(0); 508 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 509 try { 510 /* Allocate Space for the upper triangular matrix */ 511 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 512 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 513 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 514 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 515 516 /* Fill the upper triangular matrix */ 517 AiUp[0]=(PetscInt) 0; 518 AiUp[n]=nzUpper; 519 offset = 0; 520 for (i=0; i<n; i++) { 521 /* set the pointers */ 522 v = aa + ai[i]; 523 vj = aj + ai[i]; 524 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 525 526 /* first, set the diagonal elements */ 527 AjUp[offset] = (PetscInt) i; 528 AAUp[offset] = (MatScalar)1.0/v[nz]; 529 AiUp[i] = offset; 530 AALo[offset] = (MatScalar)1.0/v[nz]; 531 532 offset+=1; 533 if (nz>0) { 534 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 535 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 536 for (j=offset; j<offset+nz; j++) { 537 AAUp[j] = -AAUp[j]; 538 AALo[j] = AAUp[j]/v[nz]; 539 } 540 offset+=nz; 541 } 542 } 543 544 /* allocate space for the triangular factor information */ 545 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 546 547 /* Create the matrix description */ 548 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 549 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 550 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 551 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 552 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 553 554 
/* Create the solve analysis information */ 555 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 556 557 /* set the operation */ 558 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 559 560 /* set the matrix */ 561 upTriFactor->csrMat = new CsrMatrix; 562 upTriFactor->csrMat->num_rows = A->rmap->n; 563 upTriFactor->csrMat->num_cols = A->cmap->n; 564 upTriFactor->csrMat->num_entries = a->nz; 565 566 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 567 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 568 569 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 570 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 571 572 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 573 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 574 575 /* perform the solve analysis */ 576 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 577 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 578 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 579 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 580 581 /* assign the pointer. Is this really necessary? 
*/ 582 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 583 584 /* allocate space for the triangular factor information */ 585 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 586 587 /* Create the matrix description */ 588 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 589 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 590 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 591 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 592 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 593 594 /* Create the solve analysis information */ 595 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 596 597 /* set the operation */ 598 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 599 600 /* set the matrix */ 601 loTriFactor->csrMat = new CsrMatrix; 602 loTriFactor->csrMat->num_rows = A->rmap->n; 603 loTriFactor->csrMat->num_cols = A->cmap->n; 604 loTriFactor->csrMat->num_entries = a->nz; 605 606 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 607 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 608 609 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 610 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 611 612 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 613 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 614 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 615 616 /* perform the solve analysis */ 617 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 618 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 619 loTriFactor->csrMat->values->data().get(), 
loTriFactor->csrMat->row_offsets->data().get(), 620 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 621 622 /* assign the pointer. Is this really necessary? */ 623 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 624 625 A->offloadmask = PETSC_OFFLOAD_BOTH; 626 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 627 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 628 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 629 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 630 } catch(char *ex) { 631 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 632 } 633 } 634 PetscFunctionReturn(0); 635 } 636 637 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 638 { 639 PetscErrorCode ierr; 640 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 641 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 642 IS ip = a->row; 643 const PetscInt *rip; 644 PetscBool perm_identity; 645 PetscInt n = A->rmap->n; 646 647 PetscFunctionBegin; 648 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 649 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 650 cusparseTriFactors->workVector = new THRUSTARRAY(n); 651 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 652 653 /* lower triangular indices */ 654 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 655 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 656 if (!perm_identity) { 657 IS iip; 658 const PetscInt *irip; 659 660 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 661 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 662 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 663 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 664 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 665 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 666 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 667 ierr = ISDestroy(&iip);CHKERRQ(ierr); 668 ierr = 
PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 669 } 670 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 671 PetscFunctionReturn(0); 672 } 673 674 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 675 { 676 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 677 IS isrow = b->row,iscol = b->col; 678 PetscBool row_identity,col_identity; 679 PetscErrorCode ierr; 680 681 PetscFunctionBegin; 682 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 683 B->offloadmask = PETSC_OFFLOAD_CPU; 684 /* determine which version of MatSolve needs to be used. */ 685 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 686 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 687 if (row_identity && col_identity) { 688 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 689 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 690 B->ops->matsolve = NULL; 691 B->ops->matsolvetranspose = NULL; 692 } else { 693 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 694 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 695 B->ops->matsolve = NULL; 696 B->ops->matsolvetranspose = NULL; 697 } 698 699 /* get the triangular factors */ 700 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 701 PetscFunctionReturn(0); 702 } 703 704 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 705 { 706 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 707 IS ip = b->row; 708 PetscBool perm_identity; 709 PetscErrorCode ierr; 710 711 PetscFunctionBegin; 712 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 713 B->offloadmask = PETSC_OFFLOAD_CPU; 714 /* determine which version of MatSolve needs to be used. 
*/ 715 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 716 if (perm_identity) { 717 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 718 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 719 B->ops->matsolve = NULL; 720 B->ops->matsolvetranspose = NULL; 721 } else { 722 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 723 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 724 B->ops->matsolve = NULL; 725 B->ops->matsolvetranspose = NULL; 726 } 727 728 /* get the triangular factors */ 729 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 730 PetscFunctionReturn(0); 731 } 732 733 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 734 { 735 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 736 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 737 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 738 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 739 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 740 cusparseStatus_t stat; 741 cusparseIndexBase_t indexBase; 742 cusparseMatrixType_t matrixType; 743 cusparseFillMode_t fillMode; 744 cusparseDiagType_t diagType; 745 746 PetscFunctionBegin; 747 748 /*********************************************/ 749 /* Now the Transpose of the Lower Tri Factor */ 750 /*********************************************/ 751 752 /* allocate space for the transpose of the lower triangular factor */ 753 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 754 755 /* set the matrix descriptors of the lower triangular factor */ 756 matrixType = cusparseGetMatType(loTriFactor->descr); 757 indexBase = 
cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing flips the fill mode: stored-lower becomes stored-upper and vice versa */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve itself is a non-transpose solve */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Build an explicit transpose of A (the CSR of A^T) so MatMultTranspose and AtB
   products can run as plain non-transpose SpMV/SpMM.  The result is cached in
   cusparsestruct->matTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* dead initializer removed: unconditionally allocated below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta (device-side scalars, matching CUSPARSE_POINTER_MODE_DEVICE) */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix =
(CsrMatrix*)matstruct->mat;
    /* the transpose swaps the dimensions; the nonzero count is unchanged */
    CsrMatrix *matrixT = new CsrMatrix;
    matrixT->num_rows       = A->cmap->n;
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* cache A's row offsets on the GPU (also used by the values-only update path).
       NOTE(review): assigned without freeing a previous buffer — presumably NULL
       here because callers only build the transpose once; verify against callers. */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp = new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT = new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (transposed factors, so U comes first) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transpose solve with natural ordering: identical to the routine above but with
   no row/column permutations, so b feeds the first solve directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached L/U factors: permute b by the row permutation,
   solve L then U, and permute the result by the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector; the iteration
     length is governed by the rpermIndices index range */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Natural-ordering variant: L then U solve straight from b into x, no permutations. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirror the host AIJ matrix onto the GPU in the configured storage format.
   If only the values changed (same nonzero pattern), just refresh the value
   arrays (and the cached transpose, when present); otherwise rebuild everything. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat =
cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* nonzero pattern changed (or first assembly): tear down and rebuild */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* with compressed rows only the rows that contain nonzeros are stored */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL; /* In this case, one should not dereference ridx */
        }
        cusparsestruct->nrows = m;

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-side scalar constants, matching CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix = new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* assemble a temporary CSR on the GPU, then convert it to HYB/ELL */
          CsrMatrix *matrix = new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR is no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor: y[i] += x[i], intended for thrust algorithms over zipped iterators. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Scratch data attached to a Mat_Product for AIJCUSPARSE x DENSECUDA products. */
typedef struct {
  PetscBool   cisdense; /* caller's C was MATSEQDENSE: convert back after the GPU product */
  PetscScalar *Bt;      /* device buffer holding B^T when the product needs it (ABt/RARt) */
  Mat         X;        /* intermediate dense result used by PtAP/RARt */
} MatMatCusparse;

static PetscErrorCode
MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) * op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
   PtAP and RARt first compute the sparse-times-dense part into mmdata->X and then
   finish with a dense-dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the cached explicit transpose instead of a transpose operation */
    if (!cusp->matTranspose) {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    }
    mat = cusp->matTranspose;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->cmap->n;
    k   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForGPU();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for AIJCUSPARSE x DENSECUDA products (body continues beyond this chunk). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if
/* (the dangling "if" above this chunk takes the condition that follows) */
     (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back */
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    /* device buffer holding the explicit transpose of B, filled in the numeric phase */
    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) { /* A is bound to the CPU: defer to the plain SeqAIJ implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    /* falls through to default, which only breaks */
  default:
    break;
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx, implemented as a MatMultAdd with a NULL additive vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx, using an explicitly stored transpose of A on the GPU */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct =
(Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) { /* lazily build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt    n = yy->map->n;
    cudaError_t err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR storage: y = alpha * A^T x + beta_zero * y on the transposed struct */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* zz = A*xx + yy (yy may be NULL, giving zz = A*xx; yy may alias zz) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);

    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
    beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
         which is equivalent to the size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) { /* nrows == 0: skip the SpMV — presumably an empty-matrix guard; confirm */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) { /* MatMultAdd: zz = A*xx + yy */
      if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
      } else if (zz != yy) { /* A's not compreseed. zz already contains A*xx, and we just need to add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    } else if (compressed) { /* MatMult: zz = A*xx.
         A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }

    /* ScatterAdd the result from work vector into the full vector when A is compressed */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (compressed) {
      thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[i]] += workVector[i] for each compressed row i */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy, using the explicitly stored transpose of A on the GPU */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if
(!matstructT) { /* lazily build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* accumulate into zz only when it aliases yy; otherwise overwrite and add yy below */
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->nrows) { /* nrows == 0: skip the SpMV */
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);} /* beta was zero: the SpMV overwrote zz, add yy back */
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finishes assembly on the CPU, pushes the data to the GPU (only for flush-complete,
   unfactored, non-CPU-bound matrices) and (re)installs the GPU mult operations. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.
   That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the GPU-side data (mult structs, or the triangular factors for factored
   matrices), removes the composed functions, then delegates to the SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicates via the CPU SeqAIJ path, then converts the copy back to seqaijcusparse */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switches the Mat operation table between the CPU (SeqAIJ) and GPU (CUSPARSE) kernels */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
     can follow the example of MatBindToCPU_SeqAIJViennaCL().
   */
  /* We need to take care of function composition also */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) {
    /* bind to CPU: install the plain SeqAIJ kernels */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else {
    /* unbind: install the CUSPARSE (GPU) kernels */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* Converts a SeqAIJ matrix to SeqAIJCUSPARSE: allocates the GPU data structure
   (or the triangular-factor holder for factored matrices), creates a cusparse
   handle, installs the CUSPARSE operations and composes the type-specific
   functions. Supports initial, reuse and in-place conversion. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* GPU matrices default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* you cannot check the inode.use flag here since the matrix was just created.
         now build a GPU matrix data structure */
      B->spptr = new Mat_SeqAIJCUSPARSE;
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat            = 0;
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose   = 0;
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector     = 0;
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->format         = MAT_CUSPARSE_CSR;
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream         = 0;
      stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
      ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
    } else {
      /* NEXT, set the pointers to the triangular factors */
      B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
      stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
      ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz    = 0;
    }
  }
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor for MATSEQAIJCUSPARSE: create as SeqAIJ, then convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2048 2049 Level: beginner 2050 2051 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2052 M*/ 2053 2054 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2055 2056 2057 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2058 { 2059 PetscErrorCode ierr; 2060 2061 PetscFunctionBegin; 2062 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2063 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2064 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2065 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2066 PetscFunctionReturn(0); 2067 } 2068 2069 2070 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2071 { 2072 cusparseStatus_t stat; 2073 cusparseHandle_t handle; 2074 2075 PetscFunctionBegin; 2076 if (*cusparsestruct) { 2077 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 2078 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 2079 delete (*cusparsestruct)->workVector; 2080 delete (*cusparsestruct)->rowoffsets_gpu; 2081 if (handle = (*cusparsestruct)->handle) { 2082 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2083 } 2084 delete *cusparsestruct; 2085 *cusparsestruct = 0; 2086 } 2087 PetscFunctionReturn(0); 2088 } 2089 2090 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2091 { 2092 PetscFunctionBegin; 2093 if (*mat) { 2094 delete (*mat)->values; 2095 delete (*mat)->column_indices; 2096 delete (*mat)->row_offsets; 2097 delete *mat; 2098 *mat = 0; 
2099 } 2100 PetscFunctionReturn(0); 2101 } 2102 2103 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2104 { 2105 cusparseStatus_t stat; 2106 PetscErrorCode ierr; 2107 2108 PetscFunctionBegin; 2109 if (*trifactor) { 2110 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2111 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2112 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2113 delete *trifactor; 2114 *trifactor = 0; 2115 } 2116 PetscFunctionReturn(0); 2117 } 2118 2119 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2120 { 2121 CsrMatrix *mat; 2122 cusparseStatus_t stat; 2123 cudaError_t err; 2124 2125 PetscFunctionBegin; 2126 if (*matstruct) { 2127 if ((*matstruct)->mat) { 2128 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2129 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2130 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2131 } else { 2132 mat = (CsrMatrix*)(*matstruct)->mat; 2133 CsrMatrix_Destroy(&mat); 2134 } 2135 } 2136 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2137 delete (*matstruct)->cprowIndices; 2138 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 2139 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2140 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2141 delete *matstruct; 2142 *matstruct = 0; 2143 } 2144 PetscFunctionReturn(0); 2145 } 2146 2147 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2148 { 2149 PetscFunctionBegin; 2150 if (*trifactors) { 2151 
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 2152 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 2153 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 2154 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 2155 delete (*trifactors)->rpermIndices; 2156 delete (*trifactors)->cpermIndices; 2157 delete (*trifactors)->workVector; 2158 (*trifactors)->rpermIndices = 0; 2159 (*trifactors)->cpermIndices = 0; 2160 (*trifactors)->workVector = 0; 2161 } 2162 PetscFunctionReturn(0); 2163 } 2164 2165 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2166 { 2167 cusparseHandle_t handle; 2168 cusparseStatus_t stat; 2169 2170 PetscFunctionBegin; 2171 if (*trifactors) { 2172 MatSeqAIJCUSPARSETriFactors_Reset(trifactors); 2173 if (handle = (*trifactors)->handle) { 2174 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2175 } 2176 delete *trifactors; 2177 *trifactors = 0; 2178 } 2179 PetscFunctionReturn(0); 2180 } 2181 2182