/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Names of the MatCUSPARSEStorageFormat enum values for PetscOptionsEnum();
   the trailing entries are the enum type name, the option prefix, and the
   NULL terminator required by PETSc's enum-option machinery. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, and SpMV kernels
   implemented later in this file. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destruction helpers. The two MatSeqAIJCUSPARSEMultStruct_Destroy
   declarations are C++ overloads (this file is compiled as CUDA C++). */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Record a (caller-owned) CUDA stream on the matrix and bind it to the
   matrix' cuSPARSE handle so subsequent cuSPARSE calls run on that stream.
   NOTE(review): A->spptr is dereferenced without a NULL check — confirm all
   callers guarantee the GPU struct exists. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Adopt a caller-supplied cuSPARSE handle, destroying any different handle
   the matrix previously held, and put the handle into device pointer mode. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop the stored cuSPARSE handle WITHOUT destroying it — presumably the
   handle's ownership lives elsewhere (e.g. it was injected via
   MatCUSPARSESetHandle); verify against callers. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory registered with PETSc's MatGetFactor(): creates the (square, n x n)
   factor matrix B for A and installs the symbolic-factorization methods that
   match the requested factor type; numeric methods are installed later by the
   symbolic routines. Errors with PETSC_ERR_SUP for unsupported factor types. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is managed by the GPU structures, so skip CPU preallocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* a sequential matrix stores a single format, so ALL acts like MULT here */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to MatCUSPARSESetFormat_SeqAIJCUSPARSE when composed; a no-op
     for types that do not implement it (PetscTryMethod) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_mult_storage_format (non-factored matrices only) and
   -mat_cusparse_storage_format runtime options.
   NOTE(review): the second PetscOptionsEnum reads cusparsestruct->format
   unconditionally, but for factored matrices A->spptr holds a
   Mat_SeqAIJCUSPARSETriFactors struct, not Mat_SeqAIJCUSPARSE — confirm this
   read is safe for factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then point the
   numeric stage at the CUSPARSE version so the factors get mirrored on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: CPU symbolic phase, GPU-aware numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, GPU-aware numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, GPU-aware numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build the GPU copy of the unit-diagonal lower triangular factor L from the
   host-side factored AIJ data, create its cuSPARSE descriptor, and run the
   triangular-solve analysis. Assembly is done in pinned host buffers
   (cudaMallocHost) and then assigned into thrust device arrays.
   NOTE(review): assumes the factored SeqAIJ layout in which a->i indexes the
   strictly-lower part of each row — confirm against MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);   /* empty matrix: nothing to mirror */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
         Row 0 contributes only its unit diagonal; rows 1..n-1 contribute ai[n]-ai[1] strictly-lower entries. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for fast H2D transfer) */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix: copy each row's strictly-lower part
         and append an explicit 1.0 diagonal entry at the end of the row */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust device arrays; assign() performs the host-to-device copy */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers now that the device copies exist */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the GPU copy of the upper triangular factor U (non-unit diagonal)
   from the host-side factored AIJ data, walking the rows backwards through
   a->diag, then create the cuSPARSE descriptor and run the solve analysis.
   NOTE(review): assumes the factored SeqAIJ layout in which adiag[] indexes
   the U part and decreases with the row index, and in which the stored
   diagonal is the reciprocal of the true diagonal — confirm against
   MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);   /* empty matrix: nothing to mirror */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first and
         filling the arrays back-to-front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: invert the stored value (the
           factor stores 1/diag) to recover the true diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust device arrays; assign() performs the host-to-device copy */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers now that the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Rebuild both GPU triangular factors for an ILU/LU factorization, allocate
   the solve work vector, and upload the row/column permutations (only when
   they are not the identity). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any stale GPU factor data before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr =
MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only if non-identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: upload the column permutation only if non-identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log however many permutation arrays were actually transferred */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both GPU triangular factors for an ICC/Cholesky factorization from a
   single pass over the host factor: the upper factor U is stored directly,
   and the "lower" factor is stored as scaled upper data solved with a
   transpose operation (see the loTriFactor setup below).
   NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and to
   Mat_SeqSBAIJ (for i/j/a) — presumably the ICC factor uses SBAIJ-style
   storage; confirm against MatICCFactorSymbolic_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);   /* empty matrix: nothing to mirror */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host memory);
         AALo holds the scaled values used for the transposed "lower" solve */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (the stored diagonal is the
           reciprocal, so 1.0/v[nz] recovers the actual value) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate off-diagonals; the lower copy is additionally scaled by the
             stored (reciprocal) diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill with UNIT diagonal (the
         stored diagonal entry is ignored by the solve) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust device arrays; assign() performs the host-to-device copy */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: stored with UPPER fill but solved with
         a TRANSPOSE operation below, which realizes the lower-triangular solve */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, scaled values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(),
loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers now that the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Rebuild the GPU triangular factors for an ICC/Cholesky factorization,
   allocate the solve work vector, and upload the permutation and its inverse
   when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any stale GPU factor data before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* total nnz of both factors: the off-diagonal entries are shared (stored
     once, used twice), the diagonal is counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices: same permutation used for rows, its inverse for columns */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU: run the CPU numeric factorization, pick the natural-ordering
   solve kernels when both permutations are the identity, then mirror the
   factors onto the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: run the CPU numeric factorization, pick the
   natural-ordering solve kernels when the permutation is the identity, then
   mirror the factors onto the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build transposed (CSC) copies of both triangular factors and run the
   corresponding solve analyses, for use by MatSolveTranspose. (Definition
   continues past the end of this view.) */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase =
cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing a triangular matrix swaps the fill mode: lower <-> upper */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so the solve itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode swapped, as above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   Builds cusparsestruct->matTranspose, the explicit transpose of the matrix used
   by MatMultTranspose-type operations.  For CSR storage the transpose is a
   single csr2csc; for ELL/HYB storage the HYB matrix is converted to CSR,
   transposed, and converted back to HYB.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants used with
     CUSPARSE_POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix =
(CsrMatrix*)matstruct->mat;
    /* the transpose has the row/column dimensions of A swapped */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* copy the host row offsets a->i to 32-bit offsets on the GPU */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (the transpose solve visits the factors in the reverse order: U^T then L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Transpose solve in the natural-ordering case: no permutations are applied, so
   the two triangular solves run directly between the b and x arrays via the work
   vector.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Forward/backward triangular solve with row/column permutations:
   x = cperm( U \ (L \ b(rperm)) ), performed entirely on the GPU.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp = b(rperm).
     The copied extent is determined by the rpermIndices index range. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Triangular solve in the natural-ordering case: x = U \ (L \ b) with no
   permutations applied.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Mirrors the host CSR data onto the GPU.

   If the nonzero pattern is unchanged since the last copy (same nonzerostate,
   CSR format), only the numerical values are re-copied (and the cached
   transpose, when present, is refreshed via csr2csc).  Otherwise the GPU
   structures are destroyed and rebuilt from scratch in the requested storage
   format (CSR, or ELL/HYB via csr2hyb).  No-op when the matrix is bound to the
   CPU or already up to date on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* nonzero pattern changed (or first assembly): rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          /* only the nonempty rows are represented on the GPU */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL; /* In this case, one should not dereference ridx */
        }
        cusparsestruct->nrows = m;

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident alpha/beta constants, used with CUSPARSE_POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage the data in CSR on the GPU, then convert to HYB */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        /* NOTE(review): thrust normally throws std::exception-derived types
           (e.g. thrust::system_error); catch(char*) may not intercept them --
           confirm the intended exception type */
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* functor over a zipped tuple: accumulates the first component into the second,
   i.e. t<1> += t<0> */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* per-product data attached to a MatProduct for AIJCUSPARSE * DENSECUDA:
     cisdense - C was MATSEQDENSE on entry, so the result is converted back to it
     Bt       - device buffer holding an explicit transpose of B (ABt/RARt only)
     X        - intermediate dense result for the two-step RARt/PtAP products   */
typedef struct {
  PetscBool   cisdense;
  PetscScalar *Bt;
  Mat         X;
} MatMatCusparse;

static PetscErrorCode
MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* release the optional device buffer, the intermediate matrix, and the struct itself */
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
   Numeric phase of C = op(A)*op(B) with A SEQAIJCUSPARSE and B dense, computed
   with cusparse spmm.  AtB uses the cached explicit transpose of A; ABt/RARt
   first transpose B explicitly with cublasXgeam (spmm cannot transpose B);
   RARt/PtAP compute into the intermediate X and finish with a dense-dense
   product.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the sparse operand and the spmm dimensions (m x k times k x n) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the explicit transpose of A, building it on first use */
    if (!cusp->matTranspose) {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    }
    mat = cusp->matTranspose;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->cmap->n;
    k   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated mmdata->Bt */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);

  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back any operands/results that were temporarily moved to the GPU */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of the AIJCUSPARSE * DENSECUDA products: sets the sizes and
   type of C and allocates the MatMatCusparse product data (Bt buffer for
   ABt/RARt, intermediate X for RARt/PtAP).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr =
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* the result sizes depend on the product type (B plays the role of P in PtAP, R in RARt) */
  switch (product->type) {
  case MATPRODUCT_AB:   /* C = A*B */
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:  /* C = A^t*B */
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:  /* C = A*B^t */
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* C = B^t*A*B */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* C = B*A*B^t */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase converts C back to CPU */
  /* cusparse spmm does not support transpose on B: preallocate a device buffer for B^t */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); /* X = A*B^t */
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); /* X = A*B */
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) { /* A lives on the CPU: defer to the host AIJ x Dense implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    /* fall through */
  default: /* other product types keep whatever was already set */
    break;
  }
  PetscFunctionReturn(0);
}

/* y = A*x, implemented as MatMultAdd with a NULL addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^t*x using the lazily-built explicit transpose of A on the GPU */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) { /* explicit transpose is built on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt    n = yy->map->n;
    cudaError_t err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else { /* ELL/HYB storage */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); /* one multiply and one add per stored nonzero */
  PetscFunctionReturn(0);
}


/* z = A*x + y (or z = A*x when yy is NULL).  For compressed-row matrices the SpMV
   result is computed into a work vector and scatter-added into z */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);

    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
    beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) { /* hyb spmv has no explicit size argument; skip empty matrices */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) { /* MatMultAdd: zz = A*xx + yy */
      if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
      } else if (zz != yy) { /* A's not compressed. zz already contains A*xx, and we just need to add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }

    /* ScatterAdd the result from work vector into the full vector when A is compressed */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (compressed) {
      thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[i]] += workVector[i] for each stored (nonzero) row */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A^t*x + y using the explicit transpose of A */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if
(!matstructT) { /* explicit transpose is built on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz starts as yy; beta below is chosen so the final result is A^t*x + yy whether or not zz aliases yy */
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else { /* ELL/HYB storage */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->nrows) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);} /* zz = A^t*xx + yy */
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish the CPU-side AIJ assembly, then push the matrix to the GPU and
   (re)install the CUSPARSE mult kernels */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy callback: release the GPU data (or the triangular factors for a
   factored matrix), clear composed functions, then destroy the host AIJ data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { /* GPU data exists only after the first offload */
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else { /* factored matrices carry triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); /* destroy the host (CPU) representation last */
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate as a plain SeqAIJ, then convert the copy in place to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switch the matrix operations between the host (SeqAIJ) and GPU (CUSPARSE) kernels */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  /* We need to take care of function composition also */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) { /* bind to CPU: route the core operations to the host AIJ kernels */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else { /* unbind: restore the GPU implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* In-place conversion from SeqAIJ: allocates the GPU-side data structures
   (plain mult structs or triangular-factor pointers, depending on factortype)
   and installs the CUSPARSE implementations of the matrix operations */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  if (reuse != MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet supported");
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); /* vectors created for this matrix default to CUDA */

  if (B->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat            = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector     = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format         = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz    = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing has been copied to the GPU yet */

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a plain SeqAIJ, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2034 2035 Level: beginner 2036 2037 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2038 M*/ 2039 2040 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2041 2042 2043 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2044 { 2045 PetscErrorCode ierr; 2046 2047 PetscFunctionBegin; 2048 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2049 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2050 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2051 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2052 PetscFunctionReturn(0); 2053 } 2054 2055 2056 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2057 { 2058 cusparseStatus_t stat; 2059 cusparseHandle_t handle; 2060 2061 PetscFunctionBegin; 2062 if (*cusparsestruct) { 2063 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 2064 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 2065 delete (*cusparsestruct)->workVector; 2066 delete (*cusparsestruct)->rowoffsets_gpu; 2067 if (handle = (*cusparsestruct)->handle) { 2068 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2069 } 2070 delete *cusparsestruct; 2071 *cusparsestruct = 0; 2072 } 2073 PetscFunctionReturn(0); 2074 } 2075 2076 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2077 { 2078 PetscFunctionBegin; 2079 if (*mat) { 2080 delete (*mat)->values; 2081 delete (*mat)->column_indices; 2082 delete (*mat)->row_offsets; 2083 delete *mat; 2084 *mat = 0; 
2085 } 2086 PetscFunctionReturn(0); 2087 } 2088 2089 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2090 { 2091 cusparseStatus_t stat; 2092 PetscErrorCode ierr; 2093 2094 PetscFunctionBegin; 2095 if (*trifactor) { 2096 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2097 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2098 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2099 delete *trifactor; 2100 *trifactor = 0; 2101 } 2102 PetscFunctionReturn(0); 2103 } 2104 2105 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2106 { 2107 CsrMatrix *mat; 2108 cusparseStatus_t stat; 2109 cudaError_t err; 2110 2111 PetscFunctionBegin; 2112 if (*matstruct) { 2113 if ((*matstruct)->mat) { 2114 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2115 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2116 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2117 } else { 2118 mat = (CsrMatrix*)(*matstruct)->mat; 2119 CsrMatrix_Destroy(&mat); 2120 } 2121 } 2122 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2123 delete (*matstruct)->cprowIndices; 2124 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 2125 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2126 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2127 delete *matstruct; 2128 *matstruct = 0; 2129 } 2130 PetscFunctionReturn(0); 2131 } 2132 2133 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2134 { 2135 PetscFunctionBegin; 2136 if (*trifactors) { 2137 
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 2138 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 2139 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 2140 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 2141 delete (*trifactors)->rpermIndices; 2142 delete (*trifactors)->cpermIndices; 2143 delete (*trifactors)->workVector; 2144 (*trifactors)->rpermIndices = 0; 2145 (*trifactors)->cpermIndices = 0; 2146 (*trifactors)->workVector = 0; 2147 } 2148 PetscFunctionReturn(0); 2149 } 2150 2151 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2152 { 2153 cusparseHandle_t handle; 2154 cusparseStatus_t stat; 2155 2156 PetscFunctionBegin; 2157 if (*trifactors) { 2158 MatSeqAIJCUSPARSETriFactors_Reset(trifactors); 2159 if (handle = (*trifactors)->handle) { 2160 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2161 } 2162 delete *trifactors; 2163 *trifactors = 0; 2164 } 2165 PetscFunctionReturn(0); 2166 } 2167 2168