/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Option-name strings matching MatCUSPARSEStorageFormat, used by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* C++ overloads: one destroys a triangular-factor struct, the other a mult struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
   MatCUSPARSESetStream - records the given CUDA stream in A's GPU data and
   directs all subsequent cuSPARSE calls made through A's handle to that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");  /* GPU data never set up for this matrix */
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - installs an externally created cuSPARSE handle on A,
   destroying any handle A previously owned, and forces device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");  /* GPU data never set up for this matrix */
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - drops A's reference to its cuSPARSE handle without
   destroying it (the handle is assumed to be owned elsewhere, cf. MatCUSPARSESetHandle()).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);  /* nothing to clear; may be called on matrices whose GPU data is already gone */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSEQAIJCUSPARSE + MATSOLVERCUSPARSE: creates the
   (empty) factor matrix B and installs the symbolic-factorization entry points */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* actual storage is allocated during the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat() */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:  /* seq matrices keep a single format, so both ops set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) PetscFunctionReturn(0);  /* silently ignore other matrix types */
  {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Processes -mat_cusparse_transgen, -mat_cusparse_mult_storage_format and
   -mat_cusparse_storage_format for a SEQAIJCUSPARSE matrix */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* these two options only make sense for non-factored matrices */
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: defer to the CPU AIJ implementation, then point the numeric phase at the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: defer to the CPU AIJ implementation, then point the numeric phase at the CUSPARSE version */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* symbolic ICC happens on the CPU; redirect the numeric phase to the CUSPARSE version */
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* symbolic Cholesky happens on the CPU; redirect the numeric phase to the CUSPARSE version */
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Extracts the strictly-lower part of the (host-side) combined LU factor stored in A,
   adds unit diagonal entries, uploads it as a CSR triangular matrix, and runs the
   cuSPARSE solve analysis on it.  Host staging buffers are pinned (cudaMallocHost)
   so the thrust assign() uploads go through fast transfers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* number of nonzeros in the lower triangle including the unit diagonal;
         row 0 contributes only its diagonal, hence the ai[1] correction */
      nzLower=n+ai[n]-ai[1];

      /* pinned host staging buffers for the CSR arrays */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* assemble L row by row: copy the strictly-lower entries of row i and append
         an explicit 1.0 diagonal entry at the end of each row */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v       = aa;
      vi      = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* fresh triangular factor descriptor: lower fill, unit diagonal */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the CSR triple to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Counterpart of the routine above for the upper factor U: extracts the upper part
   (with inverted diagonal, non-unit diag type) and uploads it for triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
PetscErrorCode                    ierr;   /* remaining locals of MatSeqAIJCUSPARSEBuildILUUpperTriMatrix() */
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* adiag[] decreases, so this difference is the nnz count of the upper triangle */
      nzUpper = adiag[0]-adiag[n];

      /* pinned host staging buffers for the CSR arrays */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* fill U from the last row up, placing the (inverted) diagonal entry first in each row */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* fresh triangular factor descriptor: upper fill, non-unit diagonal */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the CSR triple to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Driver for the ILU path: (re)builds both triangular factors on the GPU,
   allocates the solve work vector, and uploads the row/column permutations
   (only when they are not the identity, to avoid needless transfers).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutations were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else
if (!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds BOTH triangular factors for the ICC/Cholesky path from the single
   upper-triangular host factor: the upper factor is stored directly, and the
   "lower" factor reuses the same upper CSR pattern but is solved with
   CUSPARSE_OPERATION_TRANSPOSE (hence its FILL_MODE_UPPER descriptor).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers; the two value arrays share the sparsity pattern */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* walk the rows; v[nz] is used as the diagonal pivot of row i
         (NOTE(review): assumes the diagonal is stored last within each row of the factor) */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* upper factor: upper fill, unit diagonal, non-transposed solve */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the CSR triple to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* "lower" factor: stored with the upper pattern but solved transposed */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* upload the CSR triple to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Driver for the ICC path: (re)builds the factors on the GPU, allocates the
   solve work vector, and uploads the permutation and its inverse when the
   ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;  /* both triangles share the off-diagonal entries */

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr =
ISGetIndices(iip,&irip);CHKERRQ(ierr);
    /* row permutation and its inverse, uploaded for use during GPU solves */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric LU: factor on the CPU via the SeqAIJ kernel, then select the GPU solve
   variant (natural ordering skips the permutation step) and push the factors to
   the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  B->ops->matsolve          = NULL;  /* MatMatSolve is not provided on the GPU */
  B->ops->matsolvetranspose = NULL;
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric Cholesky: factor on the CPU via the SeqAIJ kernel, then select the GPU
   solve variant and push the factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds explicit CSC (i.e. transposed CSR) copies of both triangular factors
   on the GPU and runs the cusparse solve analysis on them, so that
   MatSolveTranspose does not have to transpose on every call. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     is flipped because the transpose of a lower factor is upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so no-transpose solves */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode flipped) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary?
*/
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Builds an explicit transpose of the on-GPU matrix for MatMultTranspose.
   No-op unless transpose generation was requested (transgen) and the transpose
   has not already been built. Handles both the CSR format (direct csr2csc)
   and the ELL/HYB formats (hyb2csr -> csr2csc -> csr2hyb). */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars used with pointer-mode DEVICE */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows       = A->cmap->n;
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* upload the uncompressed row offsets; csr2csc needs the full CSR */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* First convert HYB to CSR */
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values)         delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets)    delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values)          delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices)  delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets)     delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  err  = WaitForGPU();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr =
PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Transposed triangular solve with row/column permutations: permute b, solve
   U^T then L^T using the explicitly stored transposed factors, then apply the
   column permutation to produce x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transposed triangular solve, natural ordering: no permutations needed, so b
   feeds straight into the U^T solve and the L^T solve writes x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve with row/column permutations: permute b
   into the work vector, solve L then U, then apply the column permutation to
   produce x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve, natural ordering: no permutations, so b
   feeds straight into the L solve and the U solve writes x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirrors the host SeqAIJ matrix on the GPU. If only values changed (same
   nonzero state, CSR format) the values are re-uploaded in place and an
   existing transpose is refreshed; otherwise the GPU structures are rebuilt
   from scratch in the requested storage format (CSR, ELL or HYB). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      mat->values->assign(a->a, a->a+a->nz);
      err  = WaitForGPU();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
        err  = WaitForGPU();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* compressed-row storage only uploads the nonempty rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars used with pointer-mode DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* upload CSR first; it is converted to HYB below and then discarded */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

cusparseHybMat_t hybMat; 1372 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1373 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1374 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1375 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1376 matstruct->descr, matrix->values->data().get(), 1377 matrix->row_offsets->data().get(), 1378 matrix->column_indices->data().get(), 1379 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1380 /* assign the pointer */ 1381 matstruct->mat = hybMat; 1382 1383 if (matrix) { 1384 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1385 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1386 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1387 delete (CsrMatrix*)matrix; 1388 } 1389 } 1390 1391 /* assign the compressed row indices */ 1392 if (a->compressedrow.use) { 1393 cusparsestruct->workVector = new THRUSTARRAY(m); 1394 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1395 matstruct->cprowIndices->assign(ridx,ridx+m); 1396 tmp = m; 1397 } else { 1398 cusparsestruct->workVector = NULL; 1399 matstruct->cprowIndices = NULL; 1400 tmp = 0; 1401 } 1402 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1403 1404 /* assign the pointer */ 1405 cusparsestruct->mat = matstruct; 1406 } catch(char *ex) { 1407 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1408 } 1409 err = WaitForGPU();CHKERRCUDA(err); 1410 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1411 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1412 cusparsestruct->nonzerostate = A->nonzerostate; 1413 } 1414 A->offloadmask = PETSC_OFFLOAD_BOTH; 1415 } 1416 PetscFunctionReturn(0); 1417 } 1418 1419 struct VecCUDAPlusEquals 1420 { 1421 template <typename Tuple> 1422 __host__ __device__ 1423 void operator()(Tuple t) 1424 { 1425 thrust::get<1>(t) = 
thrust::get<1>(t) + thrust::get<0>(t); /* dst += src */
  }
};

/* Thrust functor: dst (get<0>) = src (get<1>); used below to gather entries of a
   full-length vector into the compressed-row work vector (the "reverse" of the
   scatter performed by VecCUDAPlusEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product data attached to a Mat_Product for AIJCUSPARSE-times-dense products */
typedef struct {
  PetscBool   cisdense; /* caller passed a MATSEQDENSE (CPU) C; convert the result back after the numeric phase */
  PetscScalar *Bt;      /* device buffer for an explicit transpose of B (cusparse spmm cannot transpose B) */
  Mat         X;        /* intermediate dense product, allocated only for PtAP/RARt */
} MatMatCusparse;

/* Destroy callback installed on C->product->destroy; frees the MatMatCusparse data above */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase for products of a MATSEQAIJCUSPARSE matrix A with a dense matrix B
   (AB, AtB, ABt, PtAP, RARt), computed on the GPU with cusparse_csr_spmm.
   PtAP/RARt first compute a dense intermediate X via spmm, then combine it with B
   through MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the cusparse operand, operation and logical spmm dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP: this spmm computes the intermediate X = A*P; the Pt multiply happens below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose instead of the transpose operation */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt: this spmm computes the intermediate X = A*Rt; the R multiply happens below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* the spmm output goes to the intermediate X for PtAP/RARt, directly to C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B: build Bt explicitly with a cublas geam */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n; /* leading dimension of the transposed copy */
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForGPU();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* PETSC_TRUE: the first (dense) operand is transposed in the dense-dense multiply */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the caller handed us */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for MATSEQAIJCUSPARSE times dense products: sets the sizes/type of C,
   allocates the MatMatCusparse product data (Bt buffer, intermediate X) and installs
   the numeric callback. Only the MAT_CUSPARSE_CSR storage format is supported. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B; falls back to the CPU SeqAIJ path when A is bound to the CPU */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  default: /* other product types: leave productsymbolic unset */
    break;
  }
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A*xx */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A*xx + yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A^H*xx */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A^H*xx + yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A^T*xx */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Shared kernel for all MatMult variants: zz = op(A)*xx (+ yy), where op is
   identity (trans=false), transpose (trans=true) or conjugate transpose
   (trans=true, herm=true).  yy may be NULL (no add) and may alias zz.
   Handles compressed-row storage (zero rows dropped) by staging the SpMV
   input/output in cusparsestruct->workVector and scattering with Thrust. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A)*xx == 0 */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* NOTE(review): when yy is non-NULL and zz != yy, zz is returned untouched here
       rather than set to yy — confirm callers guarantee zz == yy (or accept this) in that case */
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* pick the device matrix structure and the cusparse operation */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* use the untransposed data with a (conjugate) transpose operation */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use the explicitly stored transpose, generating it on first use */
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather the entries of xx selected by cprowIndices into the work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
    }

    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      /* ELL/HYB storage */
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForGPU();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A^T*xx + yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
PetscFunctionReturn(0);
}

/* Assembly hook: run the usual SeqAIJ assembly, then mirror the data to the GPU.
   Skipped for flush assemblies, CPU-bound matrices, and (for the copy) factored matrices. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy: free the GPU-side data (plain or factored), remove composed functions,
   then fall through to the base SeqAIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate via the SeqAIJ duplicate followed by an in-place conversion to AIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switch the mat-vec/mat-mat operation table between the CPU (SeqAIJ) and GPU
   (SeqAIJCUSPARSE) implementations; flg == PETSC_TRUE binds to the CPU */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this?
*/
  if (flg) {
    /* bound to CPU: restore the SeqAIJ kernels; Hermitian variants have no CPU counterpart here */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* unbound: install the CUSPARSE kernels and product hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocate the GPU-side spptr (plain or
   factored flavor), set the CUDA default vector type, and install the CUSPARSE
   operation table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE) */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create a SeqAIJ matrix then convert it in place to SeqAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2069 2070 Options Database Keys: 2071 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2072 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2073 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2074 2075 Level: beginner 2076 2077 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2078 M*/ 2079 2080 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2081 2082 2083 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2084 { 2085 PetscErrorCode ierr; 2086 2087 PetscFunctionBegin; 2088 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2089 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2090 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2091 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2092 PetscFunctionReturn(0); 2093 } 2094 2095 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2096 { 2097 PetscErrorCode ierr; 2098 cusparseStatus_t stat; 2099 cusparseHandle_t handle; 2100 2101 PetscFunctionBegin; 2102 if (*cusparsestruct) { 2103 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2104 ierr = 
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 2105 delete (*cusparsestruct)->workVector; 2106 delete (*cusparsestruct)->rowoffsets_gpu; 2107 if (handle = (*cusparsestruct)->handle) { 2108 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2109 } 2110 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 2111 } 2112 PetscFunctionReturn(0); 2113 } 2114 2115 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2116 { 2117 PetscFunctionBegin; 2118 if (*mat) { 2119 delete (*mat)->values; 2120 delete (*mat)->column_indices; 2121 delete (*mat)->row_offsets; 2122 delete *mat; 2123 *mat = 0; 2124 } 2125 PetscFunctionReturn(0); 2126 } 2127 2128 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2129 { 2130 cusparseStatus_t stat; 2131 PetscErrorCode ierr; 2132 2133 PetscFunctionBegin; 2134 if (*trifactor) { 2135 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2136 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2137 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2138 delete *trifactor; 2139 *trifactor = 0; 2140 } 2141 PetscFunctionReturn(0); 2142 } 2143 2144 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2145 { 2146 CsrMatrix *mat; 2147 cusparseStatus_t stat; 2148 cudaError_t err; 2149 2150 PetscFunctionBegin; 2151 if (*matstruct) { 2152 if ((*matstruct)->mat) { 2153 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2154 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2155 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2156 } else { 2157 mat = (CsrMatrix*)(*matstruct)->mat; 2158 CsrMatrix_Destroy(&mat); 2159 } 2160 } 2161 if ((*matstruct)->descr) { stat = 
cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2162 delete (*matstruct)->cprowIndices; 2163 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 2164 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2165 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2166 delete *matstruct; 2167 *matstruct = 0; 2168 } 2169 PetscFunctionReturn(0); 2170 } 2171 2172 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2173 { 2174 PetscErrorCode ierr; 2175 2176 PetscFunctionBegin; 2177 if (*trifactors) { 2178 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2179 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2180 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2181 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 2182 delete (*trifactors)->rpermIndices; 2183 delete (*trifactors)->cpermIndices; 2184 delete (*trifactors)->workVector; 2185 (*trifactors)->rpermIndices = 0; 2186 (*trifactors)->cpermIndices = 0; 2187 (*trifactors)->workVector = 0; 2188 } 2189 PetscFunctionReturn(0); 2190 } 2191 2192 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2193 { 2194 PetscErrorCode ierr; 2195 cusparseHandle_t handle; 2196 cusparseStatus_t stat; 2197 2198 PetscFunctionBegin; 2199 if (*trifactors) { 2200 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 2201 if (handle = (*trifactors)->handle) { 2202 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2203 } 2204 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 2205 } 2206 PetscFunctionReturn(0); 2207 } 2208