/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for MatCUSPARSEStorageFormat; the trailing "type name", "prefix", NULL
   entries follow the PETSc convention expected by PetscOptionsEnum(). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic/numeric factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solve, options and mat-vec entry points */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destruction helpers for the GPU-side data structures.
   Note the two MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a CUDA stream to the matrix' cuSPARSE handle; subsequent cuSPARSE calls made
   through this handle execute on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix' cuSPARSE handle with a caller-provided one; the previous handle
   (if any, and if different) is destroyed first.  Pointer mode is set to DEVICE, so
   scalar arguments passed to cuSPARSE are read from device memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle we previously owned before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop the matrix' reference to its cuSPARSE handle without destroying it.
   NOTE(review): unlike MatCUSPARSESetHandle(), the handle is NOT cusparseDestroy'd
   here — presumably ownership lies with the caller; confirm at the call sites. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* MatFactorGetSolverType() implementation: factors produced here belong to the
   "cusparse" solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse,
  aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered with MatGetFactor() for the "cusparse" package: creates an
   empty square SEQAIJCUSPARSE factor matrix B sized by A's row map and installs the
   symbolic-factorization methods matching the requested factor type.  Unsupported
   factor types raise PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* factors are square: n x n */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic/numeric factorization allocates the storage itself */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* a sequential matrix keeps a single storage format, so ALL and MULT coincide here */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if composed; silently a no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* runtime options that select the GPU storage format(s). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option applies to un-factored matrices only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: defer to the CPU SeqAIJ implementation, then point the numeric stage
   at the CUSPARSE version so the factors are copied to the GPU afterwards. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: defer to the CPU SeqAIJ implementation; the numeric stage runs the
   CUSPARSE variant which also uploads the factors to the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: defer to the CPU SeqAIJ implementation; numeric stage runs the
   CUSPARSE Cholesky variant. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: defer to the CPU SeqAIJ implementation; numeric stage runs the
   CUSPARSE variant. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit-diagonal lower-triangular factor L from the combined LU storage of
   a factored SeqAIJ matrix, assemble it in CSR form in pinned host memory, upload it
   to the GPU (thrust arrays), and run the cuSPARSE triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask ==
PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for the upload) */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix: row 0 is just the implicit unit diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v       = aa;
      vi      = aj;
      offset  = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* thrust assign() performs the host-to-device copies */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned host staging buffers are no longer needed */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the non-unit-diagonal upper-triangular factor U from the combined LU storage
   of a factored SeqAIJ matrix (walking a->diag backwards), assemble it in CSR form in
   pinned host memory, upload it to the GPU, and run the cuSPARSE solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory for the upload) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking the rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* diagonal is stored as its reciprocal */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* thrust assign() performs the host-to-device copies */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the ILU/LU factors of A to the GPU (lower and upper separately), run the
   cuSPARSE solve analyses, and cache the row/column permutations on the GPU when the
   orderings are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any previously built GPU factors before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr =
MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used by the two-stage (lower then upper) triangular solve */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation arrays that were actually uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both triangular factors of an ICC/Cholesky factorization on the GPU from the
   upper-triangular (SBAIJ-style) storage of the factored matrix: the upper factor is
   uploaded directly, and the lower solve reuses the same sparsity with rescaled values,
   performed as a TRANSPOSE solve of upper-triangular data. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix; AAUp and AALo share the sparsity pattern but
         carry differently scaled values */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (stored as reciprocals) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description.  The "lower" factor reuses the upper-triangular
         storage (fill mode stays UPPER); the lower solve is expressed below as a
         TRANSPOSE operation on that data. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the ICC/Cholesky factors of A to the GPU, and cache the permutation and its
   inverse on the GPU when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any previously built GPU factors before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both triangular solves touch the off-diagonal entries once each plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr =
PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU factorization: run the CPU SeqAIJ kernel, mark the factor as CPU-resident,
   select the MatSolve variants (the NaturalOrdering versions skip permutation work on
   the GPU), then build/copy the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: CPU kernel first, then select the MatSolve variants
   and copy the factors to the GPU (same pattern as the LU case above). */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build CSC (i.e. transposed-CSR) copies of both triangular factors via csr2csc and run
   the solve analysis on them — presumably consumed by the MatSolveTranspose_* paths so
   transpose solves can run as NON_TRANSPOSE operations (confirm against the solvers
   later in this file). */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase =
cusparseGetMatIndexBase(loTriFactor->descr); 758 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 759 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 760 diagType = cusparseGetMatDiagType(loTriFactor->descr); 761 762 /* Create the matrix description */ 763 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 764 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 765 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 766 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 767 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 768 769 /* Create the solve analysis information */ 770 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 771 772 /* set the operation */ 773 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 774 775 /* allocate GPU space for the CSC of the lower triangular factor*/ 776 loTriFactorT->csrMat = new CsrMatrix; 777 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 778 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 779 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 780 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 781 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 782 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 783 784 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 785 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 786 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 787 loTriFactor->csrMat->values->data().get(), 788 loTriFactor->csrMat->row_offsets->data().get(), 789 loTriFactor->csrMat->column_indices->data().get(), 790 loTriFactorT->csrMat->values->data().get(), 791 loTriFactorT->csrMat->column_indices->data().get(), 792 loTriFactorT->csrMat->row_offsets->data().get(), 793 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 794 795 /* perform the solve analysis on the transposed matrix */ 796 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 797 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 798 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 799 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 800 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 801 802 /* assign the pointer. Is this really necessary? */ 803 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 804 805 /*********************************************/ 806 /* Now the Transpose of the Upper Tri Factor */ 807 /*********************************************/ 808 809 /* allocate space for the transpose of the upper triangular factor */ 810 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 811 812 /* set the matrix descriptors of the upper triangular factor */ 813 matrixType = cusparseGetMatType(upTriFactor->descr); 814 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 815 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
816 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 817 diagType = cusparseGetMatDiagType(upTriFactor->descr); 818 819 /* Create the matrix description */ 820 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 821 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 822 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 823 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 824 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 825 826 /* Create the solve analysis information */ 827 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 828 829 /* set the operation */ 830 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 831 832 /* allocate GPU space for the CSC of the upper triangular factor*/ 833 upTriFactorT->csrMat = new CsrMatrix; 834 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 835 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 836 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 837 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 838 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 839 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 840 841 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 842 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 843 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 844 upTriFactor->csrMat->values->data().get(), 845 upTriFactor->csrMat->row_offsets->data().get(), 846 upTriFactor->csrMat->column_indices->data().get(), 847 upTriFactorT->csrMat->values->data().get(), 848 upTriFactorT->csrMat->column_indices->data().get(), 849 upTriFactorT->csrMat->row_offsets->data().get(), 850 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 851 852 /* perform the solve analysis on the transposed matrix */ 853 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 854 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 855 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 856 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 857 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 858 859 /* assign the pointer. Is this really necessary? 
   */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Build the transpose of the AIJ matrix on the GPU (CSR of A^T via csr2csc,
   or ELL/HYB via intermediate CSR conversions) and store it in
   cusparsestruct->matTranspose, so transpose products can use a
   non-transposed operation on the stored transpose. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars (pointer mode DEVICE below) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose has the row/column dimensions of A swapped */
    matrixT->num_rows    = A->cmap->n;
    matrixT->num_cols    = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows    = A->rmap->n;
    temp->num_cols    = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows    = A->rmap->n;
    tempT->num_cols    = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values)         delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets)    delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values)         delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets)    delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
   on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build the transposed factors the first time a transpose solve is requested */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transposed triangular solve in natural ordering: no permutation steps are needed */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
   on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build the transposed factors the first time a transpose solve is requested */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve with row/column permutations (non-natural ordering) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>   xGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Triangular solve in natural ordering: b -> L^{-1} b -> U^{-1} (L^{-1} b), no permutations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirror the host AIJ data onto the GPU. If only the numerical values changed
   (same nonzero state, CSR format), just re-assign the values (and refresh the
   stored transpose when it exists); otherwise destroy and rebuild all GPU
   structures in the format selected in cusparsestruct->format. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat =
cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* nonzero pattern changed (or first assembly): destroy and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL; /* In this case, one should not dereference ridx */
        }
        cusparsestruct->nrows = m;

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident alpha/beta scalars (pointer mode DEVICE set below) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage the data in CSR on the GPU, then convert to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix is built */
          if (matrix) {
            if (matrix->values)         delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets)    delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor: for a zipped tuple t, accumulate get<0>(t) into get<1>(t) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Product data attached to a Mat_Product for AIJCUSPARSE x DENSECUDA products */
typedef struct {
  PetscBool   cisdense; /* the user passed a MATSEQDENSE C; convert the result back after the numeric phase */
  PetscScalar *Bt;      /* device buffer holding an explicit transpose of B (ABt/RARt cases) */
  Mat         X;        /* intermediate dense result used by RARt/PtAP */
} MatMatCusparse;

static PetscErrorCode
MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* release the device transpose buffer, the intermediate product, and the container */
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = A*B, A^T*B, A*B^T, P^T*A*P, R*A*R^T with A seqaijcusparse
   and B dense; the sparse-dense product is done with cusparse_csr_spmm, and the
   extra dense multiplication for PtAP/RARt with the SeqDenseCUDA private helper.
   Note the fix: the MATPRODUCT_ABt/RARt case previously contained a duplicated
   (unreachable) break statement; the dead statement has been removed. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the explicitly stored transpose with a non-transposed operation */
    if (!cusp->matTranspose) {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    }
    mat = cusp->matTranspose;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->cmap->n;
    k   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break; /* a second, unreachable break used to follow here; removed */
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* cusparse spmm does not support transpose on B: transpose it explicitly with geam into mmdata->Bt */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }
  /* PtAP/RARt first compute the intermediate X = A*P (resp. A*R^T) */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);

  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    /* the user originally had a MATSEQDENSE C: convert the GPU result back */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase: compute the sizes of C, set its (CUDA dense) type, and
   allocate the MatMatCusparse product data (Bt buffer for ABt/RARt, the
   intermediate X for RARt/PtAP). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr =
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* derive the dimensions of C from the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B: allocate a device buffer for B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    /* A lives on the CPU: defer to the host (SeqAIJ x SeqDense) implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* explicit break: previously fell through into default (harmless, but warned on) */
  default:
    break;
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx, implemented as MatMultAdd with a NULL addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx, using the explicitly stored transpose of A on the GPU */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* the explicit transpose is built lazily on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt    n = yy->map->n;
    cudaError_t err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* SpMV on the stored transpose, so no transpose op is needed here */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* zz = A*xx + yy (yy may be NULL for a plain MatMult, and may alias zz) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);

    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
    beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        /* ELL/HYB storage path; skip the SpMV entirely for an empty matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) { /* MatMultAdd: zz = A*xx + yy */
      if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
      } else if (zz != yy) { /* A's not compressed. zz already contains A*xx, and we just need to add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }

    /* ScatterAdd the result from work vector into the full vector when A is compressed */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (compressed) {
      /* zz[cprowIndices[i]] += workVector[i] via a zip of the work vector and a permutation of zz */
      thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy, using the explicitly stored transpose of A */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if
(!matstructT) {
    /* the explicit transpose is built lazily on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else {
      /* ELL/HYB storage path; skip the SpMV for an empty matrix */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->nrows) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);}
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Completes assembly on the host, then mirrors the matrix on the GPU and (re)installs the GPU kernels */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroys the GPU-side data (mult structs or triangular factors), removes composed
   functions, and then delegates to the host SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicates on the host, then converts the copy in place to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switches the matrix operations between the host (SeqAIJ) and GPU (SeqAIJCUSPARSE) kernels */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  /* We need to take care of function composition also */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) {
    /* bound to CPU: install the host kernels */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else {
    /* unbound: install the GPU kernels */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* In-place conversion of a SeqAIJ matrix to SEQAIJCUSPARSE: allocates the GPU-side data
   structure (mult structs for plain matrices, triangular-factor struct for factors),
   installs GPU ops, and composes the type-specific functions */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  if (reuse != MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet supported");
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat            = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector     = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format         = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz    = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create a host SeqAIJ matrix and convert it in place to SEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/* Registers the CUSPARSE solver type for LU/Cholesky/ILU/ICC factorizations of SEQAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the GPU-side data of a non-factored SEQAIJCUSPARSE matrix: both mult structs,
   the work vector, the row-offsets array, and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* check the returned error codes (these were previously ignored) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((handle = (*cusparsestruct)->handle)) { /* assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees a CsrMatrix (thrust device vectors for values/indices/offsets) and nulls the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
2086 } 2087 PetscFunctionReturn(0); 2088 } 2089 2090 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2091 { 2092 cusparseStatus_t stat; 2093 PetscErrorCode ierr; 2094 2095 PetscFunctionBegin; 2096 if (*trifactor) { 2097 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2098 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2099 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2100 delete *trifactor; 2101 *trifactor = 0; 2102 } 2103 PetscFunctionReturn(0); 2104 } 2105 2106 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2107 { 2108 CsrMatrix *mat; 2109 cusparseStatus_t stat; 2110 cudaError_t err; 2111 2112 PetscFunctionBegin; 2113 if (*matstruct) { 2114 if ((*matstruct)->mat) { 2115 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2116 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2117 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2118 } else { 2119 mat = (CsrMatrix*)(*matstruct)->mat; 2120 CsrMatrix_Destroy(&mat); 2121 } 2122 } 2123 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2124 delete (*matstruct)->cprowIndices; 2125 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 2126 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2127 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2128 delete *matstruct; 2129 *matstruct = 0; 2130 } 2131 PetscFunctionReturn(0); 2132 } 2133 2134 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2135 { 2136 PetscFunctionBegin; 2137 if (*trifactors) { 2138 
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 2139 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 2140 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 2141 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 2142 delete (*trifactors)->rpermIndices; 2143 delete (*trifactors)->cpermIndices; 2144 delete (*trifactors)->workVector; 2145 (*trifactors)->rpermIndices = 0; 2146 (*trifactors)->cpermIndices = 0; 2147 (*trifactors)->workVector = 0; 2148 } 2149 PetscFunctionReturn(0); 2150 } 2151 2152 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2153 { 2154 cusparseHandle_t handle; 2155 cusparseStatus_t stat; 2156 2157 PetscFunctionBegin; 2158 if (*trifactors) { 2159 MatSeqAIJCUSPARSETriFactors_Reset(trifactors); 2160 if (handle = (*trifactors)->handle) { 2161 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2162 } 2163 delete *trifactors; 2164 *trifactors = 0; 2165 } 2166 PetscFunctionReturn(0); 2167 } 2168 2169